File size: 2,959 Bytes
1132b50
718e159
1132b50
4dc1d14
 
 
67bfb80
6c9d07b
4dc1d14
 
 
 
 
 
 
 
 
 
 
 
 
6c9d07b
67bfb80
6c9d07b
67bfb80
6c9d07b
 
67bfb80
4dc1d14
 
718e159
1132b50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718e159
 
 
1132b50
718e159
1132b50
718e159
 
 
 
 
 
1132b50
718e159
1132b50
 
718e159
 
1132b50
 
 
 
718e159
78aafcc
1132b50
78aafcc
6c9d07b
 
718e159
6c9d07b
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import pandas as pd
from langchain.docstore.document import Document

SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
SHEET_URL_Y = "/edit#gid="
SHEET_URL_Y_EXPORT = "/export?gid="
SPLIT_PAGE_BREAKS = False
SYNONYMS = None


def get_id(sheet_url: str) -> str:
    """Extract a combined "<spreadsheet_id>-<gid>" token from a Sheets edit URL.

    The URL is expected to look like
    ``https://docs.google.com/spreadsheets/d/<id>/edit#gid=<gid>``.
    """
    id_start = sheet_url.find(SHEET_URL_X) + len(SHEET_URL_X)
    gid_marker = sheet_url.find(SHEET_URL_Y)
    spreadsheet_id = sheet_url[id_start:gid_marker]
    gid = sheet_url[gid_marker + len(SHEET_URL_Y):]
    return f"{spreadsheet_id}-{gid}"


def xlsx_url(get_id: str) -> str:
    """Rebuild an export URL from a combined "<spreadsheet_id>-<gid>" token.

    The argument is the string produced by the ``get_id`` function (the
    parameter keeps that name for keyword-call compatibility). The split is
    on the *last* "-" because spreadsheet ids may themselves contain dashes.
    """
    split_at = get_id.rfind("-")
    spreadsheet_id, gid = get_id[:split_at], get_id[split_at + 1:]
    return f"{SHEET_URL_X}{spreadsheet_id}{SHEET_URL_Y_EXPORT}{gid}"


def read_df(xlsx_url: str, page_content_column: str) -> pd.DataFrame:
    """Download the exported workbook and apply the configured transforms.

    Reads the first sheet with row 0 as the header and empty cells kept as
    "" rather than NaN (``keep_default_na=False``). Depending on the
    module-level flags, page-break splitting and synonym expansion are then
    applied to ``page_content_column``.
    """
    frame = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
    if SPLIT_PAGE_BREAKS:
        frame = split_page_breaks(frame, page_content_column)
    if SYNONYMS is not None:
        frame = duplicate_rows_with_synonyms(frame, page_content_column, SYNONYMS)
    return frame


def split_page_breaks(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Give each newline-separated chunk of ``column_name`` its own row.

    Every other column's value is repeated once per chunk so the rows stay
    aligned; the result keeps the original column order and is re-indexed
    from 0. The input frame is not modified.
    """
    chunks = df[column_name].str.split("\n")
    chunk_counts = chunks.str.len()

    exploded = pd.DataFrame({column_name: chunks.explode()})
    exploded.reset_index(drop=True, inplace=True)

    # Restore the full set of columns in their original order.
    exploded = exploded.reindex(df.columns, axis=1)

    # Fill the remaining columns, repeating each value once per chunk.
    for other in df.columns.drop(column_name):
        exploded[other] = df[other].repeat(chunk_counts).reset_index(drop=True)

    return exploded


def transform_documents_to_dataframe(documents: "list[tuple[Document, float]]") -> pd.DataFrame:
    """Flatten (document, score) pairs into one DataFrame row per document.

    Columns are ``document_score``, ``page_content``, then every metadata
    key in first-seen order. A document missing a metadata key gets None in
    that column — previously, ragged metadata produced unequal-length value
    lists, so DataFrame construction either raised or misaligned values
    across rows. (The old ``documents: Document`` annotation was also wrong:
    the argument is an iterable of pairs, e.g. a scored similarity search.)

    Args:
        documents: Iterable of (Document, score) pairs.

    Returns:
        DataFrame with one row per input pair.
    """
    values: dict = {"document_score": [], "page_content": []}

    for doc, score in documents:
        row_count = len(values["document_score"])  # rows completed so far
        for key, value in doc.metadata.items():
            # First sighting of this key: backfill earlier rows with None.
            values.setdefault(key, [None] * row_count)
            values[key].append(value)
        values["document_score"].append(score)
        values["page_content"].append(doc.page_content)
        # Pad any known key absent from this document's metadata.
        for column in values.values():
            if len(column) <= row_count:
                column.append(None)

    return pd.DataFrame(values)


def remove_duplicates_by_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Drop rows whose ``column_name`` value repeats, keeping the first.

    Mutates ``df`` in place (with a fresh 0..n-1 index) and returns the
    same object so calls can be chained.
    """
    df.drop_duplicates(subset=column_name, ignore_index=True, inplace=True)
    return df


def dataframe_to_dict(df: pd.DataFrame) -> list[dict]:
    """Convert ``df`` into a list of per-row ``{column: value}`` records.

    Despite the historical name, the result of ``orient="records"`` is a
    *list* of dicts (one per row), so the old ``-> dict`` annotation was
    wrong; an empty frame yields ``[]``.
    """
    return df.to_dict(orient="records")


def duplicate_rows_with_synonyms(df: pd.DataFrame, column: str, synonyms: list[list[str]]) -> pd.DataFrame:
    """Augment ``df`` by duplicating rows with synonym substitutions.

    For every row, each whitespace-separated word of ``row[column]`` that
    appears in one of the ``synonyms`` groups yields one extra row in which
    that word is swapped for each alternative in the group. The original
    row is always kept, and the input frame is not modified.

    Args:
        df: Source frame.
        column: Name of the text column to rewrite.
        synonyms: Groups of interchangeable words.

    Returns:
        A new DataFrame with the original rows plus generated variants,
        re-indexed from 0.

    NOTE(review): ``str.replace`` swaps *every* occurrence of ``word``,
    including as a substring of longer words (e.g. "cat" inside
    "category") — confirm this is intended before relying on it.
    NOTE(review): a word occurring N times in the text produces N
    identical variants (one per occurrence in the split); dedupe
    downstream if that matters.
    """
    new_rows = []
    for index, row in df.iterrows():
        # Keep the untouched original row first.
        new_rows.append(row)
        for synonym_list in synonyms:
            # Iterates word *occurrences*, not unique words.
            for word in row[column].split():
                if word in synonym_list:
                    for synonym in synonym_list:
                        if synonym != word:
                            new_row = row.copy()
                            new_row[column] = row[column].replace(word, synonym)
                            new_rows.append(new_row)
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    new_df = new_df.reset_index(drop=True)
    return new_df