Spaces:

eagle0504
/

YSA-Larkin-Comm

Sleeping

App Files Files Community

eagle0504 committed on Feb 14, 2024

Commit

38a30d6

verified ·

1 Parent(s): dff518b

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -27

app.py CHANGED Viewed

@@ -120,31 +120,47 @@ def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
     return result
-file_names = [f"output_files/file_{i}.txt" for i in range(131)]
-# file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
-# Initialize an empty list to hold all documents
-all_documents = []  # this is just a copy, you don't have to use this
-# Iterate over each file and load its contents
-for file_name in file_names:
-    loader = TextLoader(file_name)
-    documents = loader.load()
-    all_documents.extend(documents)
-# Split the loaded documents into chunks
-text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
-docs = text_splitter.split_documents(all_documents)
-# Create the open-source embedding function
-embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
-# embedding_function = SentenceTransformer("all-MiniLM-L6-v2")
-# embedding_function = openai_text_embedding
-# Load the documents into Chroma
-db = Chroma.from_documents(docs, embedding_function)
 st.title("Youth Homelessness Chatbot")
@@ -174,16 +190,34 @@ if prompt := st.chat_input("Tell me about YSA"):
     question = prompt
     with st.spinner("Wait for it..."):
-        docs = db.similarity_search(question)
-        docs_2 = db.similarity_search_with_score(question)
-        docs_2_table = pd.DataFrame(
             {
-                "source": [docs_2[i][0].metadata["source"] for i in range(len(docs))],
-                "content": [docs_2[i][0].page_content for i in range(len(docs))],
-                "distances": [docs_2[i][1] for i in range(len(docs))],
             }
         )
-        ref_from_db_search = docs_2_table["content"]
         engineered_prompt = f"""
             Based on the context: {ref_from_db_search},
@@ -199,9 +233,9 @@ if prompt := st.chat_input("Tell me about YSA"):
         with st.spinner("Wait for it..."):
             st.markdown(response)
             with st.expander("See reference:"):
-                st.table(docs_2_table)
     # Add assistant response to chat history
     st.session_state.messages.append({"role": "assistant", "content": response})
     st.session_state.messages.append(
-        {"role": "assistant", "content": docs_2_table.to_json()}
     )

     return result
+## rag strategy 1
+# file_names = [f"output_files/file_{i}.txt" for i in range(131)]
+# # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
+# # Initialize an empty list to hold all documents
+# all_documents = []  # this is just a copy, you don't have to use this
+# # Iterate over each file and load its contents
+# for file_name in file_names:
+#     loader = TextLoader(file_name)
+#     documents = loader.load()
+#     all_documents.extend(documents)
+# # Split the loaded documents into chunks
+# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
+# docs = text_splitter.split_documents(all_documents)
+# # Create the open-source embedding function
+# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
+# # embedding_function = SentenceTransformer("all-MiniLM-L6-v2")
+# # embedding_function = openai_text_embedding
+# # Load the documents into Chroma
+# db = Chroma.from_documents(docs, embedding_function)
+## rag strategy 2
+from datasets import load_dataset
+dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
+import chromadb
+client = chromadb.Client()
+collection = client.create_collection("vector_database")
+# Embed and store every question from the train split in the collection
+L = len(dataset["train"]['questions'])
+collection.add(
+    ids=[str(i) for i in range(0, L)],  # IDs are just strings
+    documents=dataset["train"]['questions'], # Enter questions here
+    metadatas=[{"type": "support"} for _ in range(0, L)],
+)
 st.title("Youth Homelessness Chatbot")
     question = prompt
     with st.spinner("Wait for it..."):
+        # strategy 1
+        # docs = db.similarity_search(question)
+        # docs_2 = db.similarity_search_with_score(question)
+        # docs_2_table = pd.DataFrame(
+        #     {
+        #         "source": [docs_2[i][0].metadata["source"] for i in range(len(docs))],
+        #         "content": [docs_2[i][0].page_content for i in range(len(docs))],
+        #         "distances": [docs_2[i][1] for i in range(len(docs))],
+        #     }
+        # )
+        # ref_from_db_search = docs_2_table["content"]
+        # strategy 2
+        results = collection.query(
+            query_texts=user_query,
+            n_results=5
+        )
+        idx = results["ids"][0]
+        idx = [int(i) for i in idx]
+        ref = pd.DataFrame(
             {
+                "idx": idx,
+                "question": [dataset["train"]['questions'][i] for i in idx],
+                "answers": [dataset["train"]['answers'][i] for i in idx],
+                "distances": results["distances"][0]
             }
         )
+        ref_from_db_search = ref["answers"]
         engineered_prompt = f"""
             Based on the context: {ref_from_db_search},
         with st.spinner("Wait for it..."):
             st.markdown(response)
             with st.expander("See reference:"):
+                st.table(ref)
     # Add assistant response to chat history
     st.session_state.messages.append({"role": "assistant", "content": response})
     st.session_state.messages.append(
+        {"role": "assistant", "content": ref.to_json()}
     )