File size: 3,524 Bytes
4cdba7b
5005601
 
 
 
 
 
 
 
6323bc8
5005601
8c2f0ba
 
 
 
 
6323bc8
5005601
 
 
8c2f0ba
 
 
5005601
 
96db48f
5005601
8c2f0ba
5005601
 
 
 
 
 
 
 
 
 
 
6017dce
5005601
 
 
8c2f0ba
5005601
 
 
6017dce
8c2f0ba
5005601
 
96db48f
8c2f0ba
96db48f
 
 
5005601
96db48f
 
5005601
96db48f
8c2f0ba
5005601
 
 
 
8c2f0ba
6017dce
8c2f0ba
5005601
8c2f0ba
6017dce
 
6323bc8
 
 
 
 
 
6017dce
 
 
 
6323bc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6017dce
 
 
 
 
6323bc8
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import pandas as pd
from langchain.document_loaders import DataFrameLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import AwaDB
from typing import List, Tuple
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore
import os
import shutil

# Prefix that precedes the spreadsheet id in a Google Sheets URL.
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
# Marker between the spreadsheet id and the gid in an "edit" URL (parsed by faq_id).
SHEET_URL_Y = "/edit#gid="
# Marker between the spreadsheet id and the gid in the export URL built by xlsx_url.
SHEET_URL_Y_EXPORT = "/export?gid="
# Passed as ``cache_folder`` to HuggingFaceEmbeddings.
CACHE_FOLDER = ".embedding-model"
# Passed as ``log_and_data_dir`` to AwaDB; removed wholesale by delete_vectordb.
VECTORDB_FOLDER = ".vectordb"
# Default embedding model name used when callers do not supply one.
EMBEDDING_MODEL = "sentence-transformers/all-mpnet-base-v2"


def faq_id(sheet_url: str) -> str:
    """Derive the FAQ identifier ``<spreadsheet id>-<gid>`` from a sheet URL.

    The spreadsheet id is the text between ``SHEET_URL_X`` and ``SHEET_URL_Y``;
    the gid is everything that follows ``SHEET_URL_Y``.

    Args:
        sheet_url: A URL containing ``SHEET_URL_X`` followed, later on, by
            ``SHEET_URL_Y``.

    Returns:
        The spreadsheet id and the gid joined by ``"-"``.

    Raises:
        ValueError: If either marker is missing from ``sheet_url``.
    """
    prefix_at = sheet_url.find(SHEET_URL_X)
    if prefix_at == -1:
        raise ValueError(
            f"not a recognised sheet URL (missing {SHEET_URL_X!r}): {sheet_url!r}"
        )
    id_start = prefix_at + len(SHEET_URL_X)

    # Search only after the id prefix: str.find returns -1 on a miss, which
    # would otherwise be used as a slice index and silently yield garbage.
    marker_at = sheet_url.find(SHEET_URL_Y, id_start)
    if marker_at == -1:
        raise ValueError(
            f"not a recognised sheet URL (missing {SHEET_URL_Y!r}): {sheet_url!r}"
        )

    sheet_id = sheet_url[id_start:marker_at]
    gid = sheet_url[marker_at + len(SHEET_URL_Y) :]
    return sheet_id + "-" + gid


def xlsx_url(faq_id: str) -> str:
    """Build the export URL for a FAQ identifier.

    The identifier is split at its LAST ``"-"``: the left part is used as the
    spreadsheet id and the right part as the gid, so ids that themselves
    contain dashes are handled.

    Args:
        faq_id: Identifier of the form ``<spreadsheet id>-<gid>``.

    Returns:
        ``SHEET_URL_X + <spreadsheet id> + SHEET_URL_Y_EXPORT + <gid>``.

    Raises:
        ValueError: If ``faq_id`` contains no ``"-"`` separator.
    """
    sheet_id, separator, gid = faq_id.rpartition("-")
    if not separator:
        # Without a separator there is no way to tell the id from the gid;
        # fail loudly instead of assembling a URL that cannot be valid.
        raise ValueError(f"faq_id must look like '<sheet id>-<gid>': {faq_id!r}")
    return SHEET_URL_X + sheet_id + SHEET_URL_Y_EXPORT + gid


def read_df(xlsx_url: str) -> pd.DataFrame:
    """Load the spreadsheet at ``xlsx_url`` into a DataFrame.

    Row 0 supplies the column names (``header=0``), and pandas' default
    NaN markers are disabled (``keep_default_na=False``).
    """
    frame = pd.read_excel(xlsx_url, keep_default_na=False, header=0)
    return frame


def create_documents(df: pd.DataFrame, page_content_column: str) -> "List[Document]":
    """Convert the rows of ``df`` into documents via ``DataFrameLoader``.

    Args:
        df: Source table.
        page_content_column: Name of the column handed to ``DataFrameLoader``
            as ``page_content_column``.

    Returns:
        Whatever ``DataFrameLoader.load()`` produces; the result is passed on
        as the ``documents`` argument of ``get_vectordb``.
    """
    loader = DataFrameLoader(df, page_content_column=page_content_column)
    return loader.load()


def define_embedding_function(model_name: str) -> HuggingFaceEmbeddings:
    """Create a ``HuggingFaceEmbeddings`` instance for ``model_name``.

    The instance is configured with ``normalize_embeddings`` enabled in its
    encode kwargs and with ``CACHE_FOLDER`` as its cache folder.
    """
    encode_options = {"normalize_embeddings": True}
    embedder = HuggingFaceEmbeddings(
        cache_folder=CACHE_FOLDER,
        encode_kwargs=encode_options,
        model_name=model_name,
    )
    return embedder


def get_vectordb(
    faq_id: str, embedding_function: Embeddings, documents: List[Document] = None
) -> VectorStore:
    """Return the AwaDB store for ``faq_id``, creating or loading it.

    Args:
        faq_id: Used as the AwaDB table name.
        embedding_function: Embedding implementation given to AwaDB.
        documents: When provided, a table is built from these documents;
            when ``None``, an existing table is loaded instead.

    Raises:
        Exception: If ``documents`` is ``None`` and ``load_local`` reports
            that the table could not be loaded.
    """
    # Creation path: documents were supplied, so build the table from them.
    if documents is not None:
        return AwaDB.from_documents(
            documents=documents,
            embedding=embedding_function,
            table_name=faq_id,
            log_and_data_dir=VECTORDB_FOLDER,
        )

    # Loading path: attach to the table stored under VECTORDB_FOLDER.
    store = AwaDB(embedding=embedding_function, log_and_data_dir=VECTORDB_FOLDER)
    if store.load_local(table_name=faq_id):
        return store
    raise Exception("faq_id may not exists")


def similarity_search(
    vectordb: VectorStore,
    query: str,
    k: int = 3,
) -> List[Tuple[Document, float]]:
    """Run a relevance-scored similarity search for ``query``.

    Sets the ``TOKENIZERS_PARALLELISM`` environment variable to ``"true"``
    on every call, then delegates to
    ``vectordb.similarity_search_with_relevance_scores``.

    Args:
        vectordb: Store to query.
        query: Text to search for.
        k: Value forwarded as ``k`` to the store (defaults to 3).

    Returns:
        The store's result, unchanged.
    """
    os.environ["TOKENIZERS_PARALLELISM"] = "true"
    scored_hits = vectordb.similarity_search_with_relevance_scores(query=query, k=k)
    return scored_hits


def load_vectordb_id(
    faq_id: str,
    page_content_column: str,
    embedding_function_name: str = EMBEDDING_MODEL,
) -> VectorStore:
    """Load the vector store for ``faq_id``, building it if loading fails.

    Args:
        faq_id: Identifier of the FAQ sheet / vector-store table.
        page_content_column: Column used for document content when the store
            has to be built.
        embedding_function_name: Model name passed to
            ``define_embedding_function``.

    Returns:
        The store from ``get_vectordb`` or, if that raised, the one from
        ``create_vectordb_id``.
    """
    embedder = define_embedding_function(embedding_function_name)
    try:
        return get_vectordb(faq_id=faq_id, embedding_function=embedder)
    except Exception:
        # Any failure while loading is treated as "not built yet": fall back
        # to creating the store from the sheet.
        return create_vectordb_id(faq_id, page_content_column, embedder)


def create_vectordb_id(
    faq_id: str,
    page_content_column: str,
    embedding_function: HuggingFaceEmbeddings = None,
) -> VectorStore:
    """Build the vector store for ``faq_id`` from its spreadsheet.

    Args:
        faq_id: Identifier used both to derive the sheet URL and as the
            table name.
        page_content_column: Column whose values become document content.
        embedding_function: Embedding implementation to use; when ``None``
            one is created for ``EMBEDDING_MODEL``.

    Returns:
        The store returned by ``get_vectordb`` for the freshly read documents.
    """
    embedder = (
        define_embedding_function(EMBEDDING_MODEL)
        if embedding_function is None
        else embedding_function
    )

    # Sheet -> DataFrame -> documents -> vector store.
    sheet_frame = read_df(xlsx_url(faq_id))
    docs = create_documents(sheet_frame, page_content_column)
    return get_vectordb(faq_id=faq_id, embedding_function=embedder, documents=docs)


def load_vectordb(sheet_url: str, page_content_column: str) -> VectorStore:
    """Load (or build) the vector store for the sheet at ``sheet_url``.

    Converts the URL to its FAQ identifier and delegates to
    ``load_vectordb_id`` with the default embedding model.
    """
    sheet_faq_id = faq_id(sheet_url)
    return load_vectordb_id(sheet_faq_id, page_content_column)


def delete_vectordb():
    """Delete the ``VECTORDB_FOLDER`` directory tree, ignoring any errors."""
    shutil.rmtree(path=VECTORDB_FOLDER, ignore_errors=True)