Commit 1132b50 · Parent(s): 31d4f49
deepnote update
app.py
CHANGED
@@ -1,6 +1,7 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
 import faq as faq
+import util as util
 import uvicorn
 import gradio as gr
 
@@ -21,6 +22,21 @@ async def ask_api(request: AskRequest):
     )
 
 
+@app.post("/api/v2/ask")
+async def ask_api(request: AskRequest):
+    faq_id = faq.faq_id(request.sheet_url)
+    xlsx_url = faq.xlsx_url(faq_id)
+    df = faq.read_df(xlsx_url)
+    df_update = util.split_page_breaks(df, request.page_content_column)
+    documents = faq.create_documents(df_update, request.page_content_column)
+    embedding_function = faq.define_embedding_function("sentence-transformers/all-mpnet-base-v2")
+    vectordb = faq.get_vectordb(faq_id=faq_id, embedding_function=embedding_function, documents=documents, vectordb_type=faq.VECTORDB_TYPE.Chroma)
+    documents = faq.similarity_search(vectordb, request.question, k=request.k)
+    df_doc = util.transform_documents_to_dataframe(documents)
+    df_filter = util.remove_duplicates_by_column(df_doc, "ID")
+    return util.serialize_dataframe_as_json(df_filter)
+
+
 @app.delete("/api/v1/")
 async def delete_vectordb_api():
     return delete_vectordb()
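For orientation, a client call against the new v2 endpoint might look like the sketch below. The AskRequest model itself is not shown in this diff, so the field names (sheet_url, page_content_column, question, k) are inferred from the request.* accesses in the handler; the base URL and all sample values are placeholders.

import requests

# Hypothetical client for the /api/v2/ask endpoint added above. Field names
# mirror the request.* attributes the handler reads; the URL is a placeholder
# for wherever this Space is served.
payload = {
    "sheet_url": "https://docs.google.com/spreadsheets/d/<sheet-id>/edit",
    "page_content_column": "Answer",
    "question": "How do I reset my password?",
    "k": 5,
}

response = requests.post("http://localhost:7860/api/v2/ask", json=payload)
response.raise_for_status()

# serialize_dataframe_as_json returns a list of plain dicts (one per retrieved
# document: its metadata plus a "Score"), which FastAPI encodes as JSON.
for record in response.json():
    print(record)
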
util.py
ADDED
@@ -0,0 +1,49 @@
+import pandas as pd
+
+def split_page_breaks(df, column_name):
+    split_values = df[column_name].str.split("\n")
+
+    new_df = pd.DataFrame({column_name: split_values.explode()})
+    new_df.reset_index(drop=True, inplace=True)
+
+    column_order = df.columns
+
+    new_df = new_df.reindex(column_order, axis=1)
+
+    other_columns = column_order.drop(column_name)
+    for column in other_columns:
+        new_df[column] = (
+            df[column].repeat(split_values.str.len()).reset_index(drop=True)
+        )
+
+    return new_df
+
+
+def transform_documents_to_dataframe(documents):
+    metadata_keys = set()
+    for doc, _ in documents:
+        metadata_keys.update(doc.metadata.keys())
+
+    metadata_values = {key: [] for key in metadata_keys}
+    for doc, _ in documents:
+        for key, value in doc.metadata.items():
+            metadata_values[key].append(value)
+
+    metadata_values["Score"] = [score for _, score in documents]
+
+    df = pd.DataFrame(metadata_values)
+
+    return df
+
+
+def remove_duplicates_by_column(df, column):
+    df.drop_duplicates(subset=column, inplace=True)
+    df.reset_index(drop=True, inplace=True)
+
+    return df
+
+
+def serialize_dataframe_as_json(df):
+    json_array = df.to_dict(orient='records')
+
+    return json_array
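Taken together, these helpers mirror the v2 handler's post-retrieval pipeline. A minimal, self-contained sketch of how they compose is below; the Doc class is a hypothetical stand-in for whatever document type faq.similarity_search yields (the helpers only touch a .metadata dict paired with a score), and all sample values are fabricated for illustration.

import pandas as pd

from util import (
    split_page_breaks,
    transform_documents_to_dataframe,
    remove_duplicates_by_column,
    serialize_dataframe_as_json,
)

# split_page_breaks explodes embedded newlines into extra rows and repeats
# the remaining columns so each fragment keeps its row's other values.
df = pd.DataFrame({"ID": [1, 2], "Answer": ["line one\nline two", "single line"]})
print(split_page_breaks(df, "Answer"))  # three rows; ID column becomes [1, 1, 2]

# transform_documents_to_dataframe expects (document, score) pairs; Doc is a
# stand-in exposing only the .metadata attribute the function reads.
class Doc:
    def __init__(self, metadata):
        self.metadata = metadata

results = [
    (Doc({"ID": 1, "Question": "Reset password?"}), 0.12),
    (Doc({"ID": 1, "Question": "Reset password?"}), 0.34),
    (Doc({"ID": 2, "Question": "Delete account?"}), 0.56),
]

df_doc = transform_documents_to_dataframe(results)     # metadata columns + "Score"
df_unique = remove_duplicates_by_column(df_doc, "ID")  # keeps the first hit per ID
print(serialize_dataframe_as_json(df_unique))
# -> [{'ID': 1, 'Question': 'Reset password?', 'Score': 0.12},
#     {'ID': 2, 'Question': 'Delete account?', 'Score': 0.56}]
# (metadata column order may vary: the keys are collected into a set)

One caveat worth noting: remove_duplicates_by_column mutates its argument in place (inplace=True) in addition to returning it, so callers should not expect the input DataFrame to be preserved.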