Commit
·
0c818aa
1
Parent(s):
a73f005
Returned to bge-base embeddings for now. Improved UI a little
Browse files
- app.py +29 -29
- chatfuncs/ingest.py +27 -32
- chatfuncs/ingest_borough_plan.py +1 -1
app.py
CHANGED
@@ -45,7 +45,7 @@ import chatfuncs.ingest as ing
|
|
45 |
# Load preset embeddings, vectorstore, and model
|
46 |
###
|
47 |
|
48 |
-
embeddings_name =
|
49 |
|
50 |
def load_embeddings(embeddings_name = embeddings_name):
|
51 |
|
@@ -185,7 +185,7 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
|
|
185 |
# RUN UI
|
186 |
###
|
187 |
|
188 |
-
app = gr.Blocks(theme = gr.themes.Base())#css=".gradio-container {background-color: black}")
|
189 |
|
190 |
with app:
|
191 |
ingest_text = gr.State()
|
@@ -243,9 +243,9 @@ with app:
|
|
243 |
lines=1,
|
244 |
)
|
245 |
with gr.Row():
|
246 |
-
submit = gr.Button(value="Send message", variant="
|
247 |
-
clear = gr.Button(value="Clear chat", variant="secondary", scale=
|
248 |
-
stop = gr.Button(value="Stop generating", variant="secondary", scale=
|
249 |
|
250 |
examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
|
251 |
#value = "What were the five pillars of the previous borough plan?",
|
@@ -296,45 +296,45 @@ with app:
|
|
296 |
examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
|
297 |
|
298 |
change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
|
304 |
# Load in a pdf
|
305 |
load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
|
310 |
# Load in a webpage
|
311 |
load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
|
316 |
# Load in a csv/excel file
|
317 |
load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
|
322 |
# Load in a webpage
|
323 |
|
324 |
# Click/enter to send message action
|
325 |
response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False, api_name="retrieval").\
|
326 |
-
|
327 |
-
|
328 |
-
response_click.
|
329 |
-
|
330 |
-
|
331 |
|
332 |
response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False).\
|
333 |
-
|
334 |
-
|
335 |
-
response_enter.
|
336 |
-
|
337 |
-
|
338 |
|
339 |
# Stop box
|
340 |
stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
|
@@ -356,7 +356,7 @@ with app:
|
|
356 |
access_callback.setup([session_hash_textbox], access_logs_data_folder)
|
357 |
|
358 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
359 |
-
|
360 |
|
361 |
# Launch the Gradio app
|
362 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
|
|
45 |
# Load preset embeddings, vectorstore, and model
|
46 |
###
|
47 |
|
48 |
+
embeddings_name = "BAAI/bge-base-en-v1.5" #"mixedbread-ai/mxbai-embed-xsmall-v1"
|
49 |
|
50 |
def load_embeddings(embeddings_name = embeddings_name):
|
51 |
|
|
|
185 |
# RUN UI
|
186 |
###
|
187 |
|
188 |
+
app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)#css=".gradio-container {background-color: black}")
|
189 |
|
190 |
with app:
|
191 |
ingest_text = gr.State()
|
|
|
243 |
lines=1,
|
244 |
)
|
245 |
with gr.Row():
|
246 |
+
submit = gr.Button(value="Send message", variant="primary", scale = 4)
|
247 |
+
clear = gr.Button(value="Clear chat", variant="secondary", scale=1)
|
248 |
+
stop = gr.Button(value="Stop generating", variant="secondary", scale=1)
|
249 |
|
250 |
examples_set = gr.Radio(label="Examples for the Lambeth Borough Plan",
|
251 |
#value = "What were the five pillars of the previous borough plan?",
|
|
|
296 |
examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
|
297 |
|
298 |
change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
|
299 |
+
success(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
|
300 |
+
success(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
|
301 |
+
success(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
|
302 |
+
success(lambda: None, None, chatbot, queue=False)
|
303 |
|
304 |
# Load in a pdf
|
305 |
load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
|
306 |
+
success(ing.text_to_docs, inputs=[ingest_text], outputs=[ingest_docs]).\
|
307 |
+
success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
|
308 |
+
success(chatf.hide_block, outputs = [examples_set])
|
309 |
|
310 |
# Load in a webpage
|
311 |
load_web_click = load_web.click(ing.parse_html, inputs=[in_web, in_div], outputs=[ingest_text, ingest_metadata, current_source]).\
|
312 |
+
success(ing.html_text_to_docs, inputs=[ingest_text, ingest_metadata], outputs=[ingest_docs]).\
|
313 |
+
success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
|
314 |
+
success(chatf.hide_block, outputs = [examples_set])
|
315 |
|
316 |
# Load in a csv/excel file
|
317 |
load_csv_click = load_csv.click(ing.parse_csv_or_excel, inputs=[in_csv, in_text_column], outputs=[ingest_text, current_source]).\
|
318 |
+
success(ing.csv_excel_text_to_docs, inputs=[ingest_text, in_text_column], outputs=[ingest_docs]).\
|
319 |
+
success(embed_faiss_save_to_zip, inputs=[ingest_docs], outputs=[ingest_embed_out, vectorstore_state, file_out_box]).\
|
320 |
+
success(chatf.hide_block, outputs = [examples_set])
|
321 |
|
322 |
# Load in a webpage
|
323 |
|
324 |
# Click/enter to send message action
|
325 |
response_click = submit.click(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False, api_name="retrieval").\
|
326 |
+
success(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
|
327 |
+
success(chatf.produce_streaming_answer_chatbot, inputs=[chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], outputs=chatbot)
|
328 |
+
response_click.success(chatf.highlight_found_text, [chatbot, sources], [sources]).\
|
329 |
+
success(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
|
330 |
+
success(lambda: chatf.restore_interactivity(), None, [message], queue=False)
|
331 |
|
332 |
response_enter = message.submit(chatf.create_full_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state, model_type_state, out_passages, api_model_choice, in_api_key], outputs=[chat_history_state, sources, instruction_prompt_out, relevant_query_state], queue=False).\
|
333 |
+
success(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
|
334 |
+
success(chatf.produce_streaming_answer_chatbot, [chatbot, instruction_prompt_out, model_type_state, temp_slide, relevant_query_state], chatbot)
|
335 |
+
response_enter.success(chatf.highlight_found_text, [chatbot, sources], [sources]).\
|
336 |
+
success(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
|
337 |
+
success(lambda: chatf.restore_interactivity(), None, [message], queue=False)
|
338 |
|
339 |
# Stop box
|
340 |
stop.click(fn=None, inputs=None, outputs=None, cancels=[response_click, response_enter])
|
|
|
356 |
access_callback.setup([session_hash_textbox], access_logs_data_folder)
|
357 |
|
358 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
359 |
+
success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
360 |
|
361 |
# Launch the Gradio app
|
362 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
chatfuncs/ingest.py
CHANGED
@@ -7,6 +7,7 @@ import requests
|
|
7 |
import pandas as pd
|
8 |
import dateutil.parser
|
9 |
from typing import Type, List
|
|
|
10 |
|
11 |
from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
|
12 |
from langchain_community.vectorstores.faiss import FAISS
|
@@ -573,56 +574,50 @@ def load_embeddings(model_name = "BAAI/bge-base-en-v1.5"):
|
|
573 |
|
574 |
return embeddings_func
|
575 |
|
576 |
-
def embed_faiss_save_to_zip(docs_out, save_to="output", model_name
|
577 |
-
|
578 |
load_embeddings(model_name=model_name)
|
579 |
|
580 |
-
#embeddings_fast = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
581 |
-
|
582 |
print(f"> Total split documents: {len(docs_out)}")
|
583 |
|
584 |
vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)
|
585 |
|
586 |
-
|
587 |
-
|
588 |
|
589 |
-
|
590 |
-
vectorstore.save_local(folder_path=save_to)
|
591 |
|
592 |
-
print(">
|
593 |
print(f"> Saved to: {save_to}")
|
594 |
|
595 |
-
|
|
|
|
|
596 |
|
597 |
-
|
|
|
598 |
|
599 |
-
|
|
|
|
|
|
|
|
|
600 |
|
601 |
-
|
602 |
-
|
603 |
|
604 |
-
|
|
|
|
|
605 |
|
606 |
-
|
|
|
607 |
|
608 |
-
|
|
|
|
|
|
|
609 |
|
610 |
-
return out_message, vectorstore, save_zip_out
|
611 |
|
612 |
-
def docs_to_chroma_save(embeddings, docs_out:PandasDataFrame, save_to:str):
|
613 |
-
print(f"> Total split documents: {len(docs_out)}")
|
614 |
-
|
615 |
-
vectordb = Chroma.from_documents(documents=docs_out,
|
616 |
-
embedding=embeddings,
|
617 |
-
persist_directory=save_to)
|
618 |
-
|
619 |
-
# persiste the db to disk
|
620 |
-
vectordb.persist()
|
621 |
-
|
622 |
-
print("> DONE")
|
623 |
-
print(f"> Saved to: {save_to}")
|
624 |
-
|
625 |
-
return vectordb
|
626 |
|
627 |
def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
|
628 |
|
|
|
7 |
import pandas as pd
|
8 |
import dateutil.parser
|
9 |
from typing import Type, List
|
10 |
+
import shutil
|
11 |
|
12 |
from langchain_community.embeddings import HuggingFaceEmbeddings # HuggingFaceInstructEmbeddings,
|
13 |
from langchain_community.vectorstores.faiss import FAISS
|
|
|
574 |
|
575 |
return embeddings_func
|
576 |
|
577 |
+
def embed_faiss_save_to_zip(docs_out, save_to: str = "output", model_name: str = "BAAI/bge-base-en-v1.5"):
    """Embed documents into a FAISS index, save it to disk and zip the result.

    The index is written into the ``save_to`` folder, the folder's contents
    are archived, and the raw ``index.faiss`` / ``index.pkl`` files are then
    deleted so only the ZIP remains on disk. The in-memory vectorstore is
    unaffected by that clean-up and is returned to the caller.

    Args:
        docs_out: Split documents to embed, passed straight to
            ``FAISS.from_documents`` (presumably a list of LangChain
            ``Document`` objects - confirm against callers).
        save_to: Folder the index is saved into. It is created if missing.
            The archive is written next to it as ``<save_to>.zip``.
        model_name: Name of the embedding model handed to ``load_embeddings``.

    Returns:
        A 3-tuple ``(status_message, vectorstore, final_zip_path)`` where
        ``final_zip_path`` is the archive path returned by
        ``shutil.make_archive``.

    Raises:
        FileNotFoundError: If ``index.faiss`` or ``index.pkl`` is missing
            from ``save_to`` after the index has been saved.
    """
    # Bind the returned embeddings object locally instead of relying on a
    # module-level ``embeddings`` name happening to exist.
    embeddings = load_embeddings(model_name=model_name)

    print(f"> Total split documents: {len(docs_out)}")

    vectorstore = FAISS.from_documents(documents=docs_out, embedding=embeddings)

    save_to_path = Path(save_to)
    save_to_path.mkdir(parents=True, exist_ok=True)

    vectorstore.save_local(folder_path=str(save_to_path))

    print("> FAISS index saved")
    print(f"> Saved to: {save_to}")

    # Ensure both index files were actually written before archiving
    index_faiss = save_to_path / "index.faiss"
    index_pkl = save_to_path / "index.pkl"

    if not index_faiss.exists() or not index_pkl.exists():
        raise FileNotFoundError("Expected FAISS index files not found before zipping.")

    # Flush pending file system writes where the platform supports it
    try:
        os.sync()
    except AttributeError:
        pass  # os.sync() not available on Windows

    # Archive the folder's contents; the ZIP is created beside the folder
    # (base name + ".zip"), not inside it.
    final_zip_path = shutil.make_archive(str(save_to_path), 'zip', root_dir=str(save_to_path))

    # Remove individual index files to avoid leaking large raw files
    index_faiss.unlink(missing_ok=True)
    index_pkl.unlink(missing_ok=True)

    print("> Archive complete")
    print(f"> Final ZIP path: {final_zip_path}")

    return "Document processing complete", vectorstore, final_zip_path
|
619 |
|
|
|
620 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
621 |
|
622 |
def sim_search_local_saved_vec(query, k_val, save_to="faiss_lambeth_census_embedding"):
|
623 |
|
chatfuncs/ingest_borough_plan.py
CHANGED
@@ -8,7 +8,7 @@ print(borough_plan_text)
|
|
8 |
borough_plan_docs = ing.text_to_docs(borough_plan_text)
|
9 |
print("Borough plan docs created")
|
10 |
|
11 |
-
embedding_model =
|
12 |
|
13 |
embeddings = ing.load_embeddings(model_name = embedding_model)
|
14 |
ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
|
|
|
8 |
borough_plan_docs = ing.text_to_docs(borough_plan_text)
|
9 |
print("Borough plan docs created")
|
10 |
|
11 |
+
embedding_model = "BAAI/bge-base-en-v1.5" # "mixedbread-ai/mxbai-embed-xsmall-v1" #
|
12 |
|
13 |
embeddings = ing.load_embeddings(model_name = embedding_model)
|
14 |
ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
|