Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -120,31 +120,47 @@ def convert_to_list_of_dict(df: pd.DataFrame) -> List[Dict[str, str]]:
|
|
120 |
return result
|
121 |
|
122 |
|
123 |
-
|
124 |
-
# file_names = [f"
|
|
|
125 |
|
126 |
|
127 |
-
# Initialize an empty list to hold all documents
|
128 |
-
all_documents = [] # this is just a copy, you don't have to use this
|
129 |
|
130 |
-
# Iterate over each file and load its contents
|
131 |
-
for file_name in file_names:
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
|
136 |
-
# Split the loaded documents into chunks
|
137 |
-
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
138 |
-
docs = text_splitter.split_documents(all_documents)
|
139 |
|
140 |
-
# Create the open-source embedding function
|
141 |
-
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
142 |
-
# embedding_function = SentenceTransformer("all-MiniLM-L6-v2")
|
143 |
-
# embedding_function = openai_text_embedding
|
144 |
|
145 |
-
# Load the documents into Chroma
|
146 |
-
db = Chroma.from_documents(docs, embedding_function)
|
147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
149 |
st.title("Youth Homelessness Chatbot")
|
150 |
|
@@ -174,16 +190,34 @@ if prompt := st.chat_input("Tell me about YSA"):
|
|
174 |
question = prompt
|
175 |
|
176 |
with st.spinner("Wait for it..."):
|
177 |
-
|
178 |
-
|
179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
180 |
{
|
181 |
-
"
|
182 |
-
"
|
183 |
-
"
|
|
|
184 |
}
|
185 |
)
|
186 |
-
ref_from_db_search =
|
187 |
|
188 |
engineered_prompt = f"""
|
189 |
Based on the context: {ref_from_db_search},
|
@@ -199,9 +233,9 @@ if prompt := st.chat_input("Tell me about YSA"):
|
|
199 |
with st.spinner("Wait for it..."):
|
200 |
st.markdown(response)
|
201 |
with st.expander("See reference:"):
|
202 |
-
st.table(
|
203 |
# Add assistant response to chat history
|
204 |
st.session_state.messages.append({"role": "assistant", "content": response})
|
205 |
st.session_state.messages.append(
|
206 |
-
{"role": "assistant", "content":
|
207 |
)
|
|
|
120 |
return result
|
121 |
|
122 |
|
123 |
+
## rag strategy 1
|
124 |
+
# file_names = [f"output_files/file_{i}.txt" for i in range(131)]
|
125 |
+
# # file_names = [f"output_files_large/file_{i}.txt" for i in range(1310)]
|
126 |
|
127 |
|
128 |
+
# # Initialize an empty list to hold all documents
|
129 |
+
# all_documents = [] # this is just a copy, you don't have to use this
|
130 |
|
131 |
+
# # Iterate over each file and load its contents
|
132 |
+
# for file_name in file_names:
|
133 |
+
# loader = TextLoader(file_name)
|
134 |
+
# documents = loader.load()
|
135 |
+
# all_documents.extend(documents)
|
136 |
|
137 |
+
# # Split the loaded documents into chunks
|
138 |
+
# text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
|
139 |
+
# docs = text_splitter.split_documents(all_documents)
|
140 |
|
141 |
+
# # Create the open-source embedding function
|
142 |
+
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
|
143 |
+
# # embedding_function = SentenceTransformer("all-MiniLM-L6-v2")
|
144 |
+
# # embedding_function = openai_text_embedding
|
145 |
|
146 |
+
# # Load the documents into Chroma
|
147 |
+
# db = Chroma.from_documents(docs, embedding_function)
|
148 |
|
149 |
+
## rag strategy 2
|
150 |
+
from datasets import load_dataset
|
151 |
+
dataset = load_dataset("eagle0504/youthless-homeless-shelter-web-scrape-dataset-qa-formatted")
|
152 |
+
|
153 |
+
import chromadb
|
154 |
+
client = chromadb.Client()
|
155 |
+
collection = client.create_collection("vector_database")
|
156 |
+
|
157 |
+
# Embed and store the first N supports for this demo
|
158 |
+
L = len(dataset["train"]['questions'])
|
159 |
+
collection.add(
|
160 |
+
ids=[str(i) for i in range(0, L)], # IDs are just strings
|
161 |
+
documents=dataset["train"]['questions'], # Enter questions here
|
162 |
+
metadatas=[{"type": "support"} for _ in range(0, L)],
|
163 |
+
)
|
164 |
|
165 |
st.title("Youth Homelessness Chatbot")
|
166 |
|
|
|
190 |
question = prompt
|
191 |
|
192 |
with st.spinner("Wait for it..."):
|
193 |
+
# strategy 1
|
194 |
+
# docs = db.similarity_search(question)
|
195 |
+
# docs_2 = db.similarity_search_with_score(question)
|
196 |
+
# docs_2_table = pd.DataFrame(
|
197 |
+
# {
|
198 |
+
# "source": [docs_2[i][0].metadata["source"] for i in range(len(docs))],
|
199 |
+
# "content": [docs_2[i][0].page_content for i in range(len(docs))],
|
200 |
+
# "distances": [docs_2[i][1] for i in range(len(docs))],
|
201 |
+
# }
|
202 |
+
# )
|
203 |
+
# ref_from_db_search = docs_2_table["content"]
|
204 |
+
|
205 |
+
# strategy 2
|
206 |
+
results = collection.query(
|
207 |
+
query_texts=user_query,
|
208 |
+
n_results=5
|
209 |
+
)
|
210 |
+
idx = results["ids"][0]
|
211 |
+
idx = [int(i) for i in idx]
|
212 |
+
ref = pd.DataFrame(
|
213 |
{
|
214 |
+
"idx": idx,
|
215 |
+
"question": [dataset["train"]['questions'][i] for i in idx],
|
216 |
+
"answers": [dataset["train"]['answers'][i] for i in idx],
|
217 |
+
"distances": results["distances"][0]
|
218 |
}
|
219 |
)
|
220 |
+
ref_from_db_search = ref["answers"]
|
221 |
|
222 |
engineered_prompt = f"""
|
223 |
Based on the context: {ref_from_db_search},
|
|
|
233 |
with st.spinner("Wait for it..."):
|
234 |
st.markdown(response)
|
235 |
with st.expander("See reference:"):
|
236 |
+
st.table(ref)
|
237 |
# Add assistant response to chat history
|
238 |
st.session_state.messages.append({"role": "assistant", "content": response})
|
239 |
st.session_state.messages.append(
|
240 |
+
{"role": "assistant", "content": ref.to_json()}
|
241 |
)
|