andreasmartin committed
Commit 1132b50 · 1 Parent(s): 31d4f49

deepnote update

Files changed (2)
  1. app.py +16 -0
  2. util.py +49 -0
app.py CHANGED
@@ -1,6 +1,7 @@
  from fastapi import FastAPI
  from pydantic import BaseModel
  import faq as faq
+ import util as util
  import uvicorn
  import gradio as gr

@@ -21,6 +22,21 @@ async def ask_api(request: AskRequest):
      )


+ @app.post("/api/v2/ask")
+ async def ask_api(request: AskRequest):
+     faq_id = faq.faq_id(request.sheet_url)
+     xlsx_url = faq.xlsx_url(faq_id)
+     df = faq.read_df(xlsx_url)
+     df_update = util.split_page_breaks(df, request.page_content_column)
+     documents = faq.create_documents(df_update, request.page_content_column)
+     embedding_function = faq.define_embedding_function("sentence-transformers/all-mpnet-base-v2")
+     vectordb = faq.get_vectordb(faq_id=faq_id, embedding_function=embedding_function, documents=documents, vectordb_type=faq.VECTORDB_TYPE.Chroma)
+     documents = faq.similarity_search(vectordb, request.question, k=request.k)
+     df_doc = util.transform_documents_to_dataframe(documents)
+     df_filter = util.remove_duplicates_by_column(df_doc, "ID")
+     return util.serialize_dataframe_as_json(df_filter)
+
+
  @app.delete("/api/v1/")
  async def delete_vectordb_api():
      return delete_vectordb()
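
For reference, the new /api/v2/ask route can be exercised with a plain HTTP client. The sketch below is illustrative, not part of the commit: it assumes the app is served on uvicorn's default port 8000, and the request fields are inferred from what the handler reads off the request object (sheet_url, page_content_column, question, k); the sheet URL and column name are placeholders.

import requests

# Hypothetical request body; AskRequest's exact schema is not shown in this
# diff, so the field names below mirror the handler's attribute accesses.
payload = {
    "sheet_url": "https://docs.google.com/spreadsheets/d/<sheet-id>",
    "page_content_column": "Question",  # placeholder column name
    "question": "How do I reset my password?",
    "k": 5,  # number of similar documents to retrieve
}

response = requests.post("http://localhost:8000/api/v2/ask", json=payload)
print(response.json())  # per-document metadata records, deduplicated on "ID"

The response body is whatever serialize_dataframe_as_json produces: one record per retrieved document, including the Score column that util.py attaches during the DataFrame conversion.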
util.py ADDED
@@ -0,0 +1,49 @@
+ import pandas as pd
+
+ def split_page_breaks(df, column_name):
+     split_values = df[column_name].str.split("\n")
+
+     new_df = pd.DataFrame({column_name: split_values.explode()})
+     new_df.reset_index(drop=True, inplace=True)
+
+     column_order = df.columns
+
+     new_df = new_df.reindex(column_order, axis=1)
+
+     other_columns = column_order.drop(column_name)
+     for column in other_columns:
+         new_df[column] = (
+             df[column].repeat(split_values.str.len()).reset_index(drop=True)
+         )
+
+     return new_df
+
+
+ def transform_documents_to_dataframe(documents):
+     metadata_keys = set()
+     for doc, _ in documents:
+         metadata_keys.update(doc.metadata.keys())
+
+     metadata_values = {key: [] for key in metadata_keys}
+     for doc, _ in documents:
+         for key, value in doc.metadata.items():
+             metadata_values[key].append(value)
+
+     metadata_values["Score"] = [score for _, score in documents]
+
+     df = pd.DataFrame(metadata_values)
+
+     return df
+
+
+ def remove_duplicates_by_column(df, column):
+     df.drop_duplicates(subset=column, inplace=True)
+     df.reset_index(drop=True, inplace=True)
+
+     return df
+
+
+ def serialize_dataframe_as_json(df):
+     json_array = df.to_dict(orient='records')
+
+     return json_array
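
A toy round trip is enough to sanity-check the new helpers. Everything below is a hypothetical example (column names and values are invented, not from the commit); transform_documents_to_dataframe expects (document, score) pairs whose first element carries a .metadata dict, the shape of LangChain-style similarity_search_with_score results, so a SimpleNamespace stand-in suffices here.

import pandas as pd
from types import SimpleNamespace

import util

# split_page_breaks: one output row per "\n"-separated line, with every
# other column repeated to match.
df = pd.DataFrame({
    "Question": ["How do I log in?\nWhere do I sign in?", "How do I reset my password?"],
    "ID": [1, 2],
})
exploded = util.split_page_breaks(df, "Question")
# -> 3 rows: ("How do I log in?", 1), ("Where do I sign in?", 1),
#    ("How do I reset my password?", 2)

# transform + dedup + serialize, mirroring the tail of the v2 handler.
hits = [
    (SimpleNamespace(metadata={"ID": 1, "Answer": "Use SSO."}), 0.12),
    (SimpleNamespace(metadata={"ID": 1, "Answer": "Use SSO."}), 0.18),
    (SimpleNamespace(metadata={"ID": 2, "Answer": "Click 'Forgot password'."}), 0.35),
]
df_doc = util.transform_documents_to_dataframe(hits)        # ID/Answer columns plus Score
df_filter = util.remove_duplicates_by_column(df_doc, "ID")  # first hit per ID wins
print(util.serialize_dataframe_as_json(df_filter))          # list of dicts, one per ID

Two details worth noting: serialize_dataframe_as_json returns a list of dicts rather than a JSON string (FastAPI performs the actual encoding when the handler returns it), and transform_documents_to_dataframe assumes every document carries the same metadata keys; ragged metadata would leave the per-key lists at unequal lengths and make the pd.DataFrame constructor raise.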