File size: 3,313 Bytes
1132b50
718e159
05f3fd9
1132b50
4dc1d14
 
 
67bfb80
6c9d07b
4dc1d14
 
 
 
 
 
 
 
 
 
 
 
 
6c9d07b
67bfb80
6c9d07b
67bfb80
e547dc0
6c9d07b
 
67bfb80
4dc1d14
 
718e159
1132b50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718e159
 
 
1132b50
718e159
1132b50
718e159
 
 
 
 
 
1132b50
718e159
1132b50
 
718e159
 
1132b50
 
 
 
718e159
78aafcc
1132b50
78aafcc
6c9d07b
 
718e159
6c9d07b
 
 
05f3fd9
6c9d07b
05f3fd9
0ddd09e
05f3fd9
 
 
6c9d07b
05f3fd9
6c9d07b
 
 
 
e547dc0
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import pandas as pd
from langchain.docstore.document import Document
import re

# Prefix of a Google Sheets URL; get_id() treats the text right after it as the
# document id, and xlsx_url() prepends it when rebuilding a URL.
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
# Marker that get_id() looks for between the document id and the gid.
SHEET_URL_Y = "/edit#gid="
# Marker that xlsx_url() inserts between the document id and the gid.
SHEET_URL_Y_EXPORT = "/export?gid="
# When truthy, read_df() runs split_page_breaks() on the content column.
SPLIT_PAGE_BREAKS = False
# When not None, read_df() passes this to duplicate_rows_with_synonyms(), which
# iterates it as a list of synonym groups (each group a list of strings).
SYNONYMS = None


def get_id(sheet_url: str) -> str:
    """Build a "<document id>-<gid>" key from a Google Sheets edit URL.

    The document id is the text between ``SHEET_URL_X`` and ``SHEET_URL_Y``;
    the gid is everything after ``SHEET_URL_Y``.

    Args:
        sheet_url: URL containing both ``SHEET_URL_X`` and ``SHEET_URL_Y``.

    Returns:
        The document id and gid joined by a single "-".

    Raises:
        ValueError: If either marker is missing from ``sheet_url``.
    """
    prefix_at = sheet_url.find(SHEET_URL_X)
    if prefix_at == -1:
        raise ValueError(
            f"expected {SHEET_URL_X!r} in sheet URL, got {sheet_url!r}"
        )
    doc_start = prefix_at + len(SHEET_URL_X)

    # str.find returns -1 on a miss; slicing with that would silently yield a
    # garbage id, so fail loudly instead. Searching from doc_start also ignores
    # any stray marker that appears before the prefix.
    marker_at = sheet_url.find(SHEET_URL_Y, doc_start)
    if marker_at == -1:
        raise ValueError(
            f"expected {SHEET_URL_Y!r} after the document id, got {sheet_url!r}"
        )

    doc_id = sheet_url[doc_start:marker_at]
    gid = sheet_url[marker_at + len(SHEET_URL_Y):]
    return doc_id + "-" + gid


def xlsx_url(get_id: str) -> str:
    """Turn a "<document id>-<gid>" key into a Google Sheets export URL.

    The key is split at its LAST "-", so a document id that itself contains
    dashes is kept intact.

    Args:
        get_id: Key in the "<document id>-<gid>" form. The parameter name
            shadows the module-level ``get_id`` function inside this body; it
            is kept so keyword callers continue to work.

    Returns:
        ``SHEET_URL_X + <document id> + SHEET_URL_Y_EXPORT + <gid>``.

    Raises:
        ValueError: If the key contains no "-" separator.
    """
    doc_id, separator, gid = get_id.rpartition("-")
    if not separator:
        # Without a separator there is no gid to put in the URL; the previous
        # rfind()-based slicing would have produced a nonsense URL here.
        raise ValueError(f"expected '<document id>-<gid>', got {get_id!r}")
    return SHEET_URL_X + doc_id + SHEET_URL_Y_EXPORT + gid


def read_df(xlsx_url: str, page_content_column: str) -> pd.DataFrame:
    """Load a spreadsheet and prepare its content column.

    Steps, in order:
      1. Read the workbook with the first row as header and
         ``keep_default_na=False``.
      2. If ``SPLIT_PAGE_BREAKS`` is truthy, split the content column into
         one row per line.
      3. Drop rows whose content column is blank.
      4. If ``SYNONYMS`` is not None, add synonym-substituted copies of rows.

    Args:
        xlsx_url: Location passed straight to ``pd.read_excel``.
        page_content_column: Name of the column holding the text content.

    Returns:
        The processed DataFrame.
    """
    frame = pd.read_excel(xlsx_url, header=0, keep_default_na=False)

    if SPLIT_PAGE_BREAKS:
        frame = split_page_breaks(frame, page_content_column)

    frame = remove_empty_rows(frame, page_content_column)

    if SYNONYMS is None:
        return frame
    return duplicate_rows_with_synonyms(frame, page_content_column, SYNONYMS)


def split_page_breaks(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Split each cell of *column_name* on newlines into separate rows.

    Every resulting row carries a copy of the other columns' values from the
    row it came from. Column order is preserved and the result has a fresh
    0..n-1 index. The input frame is not modified.

    Cells that are not strings are passed through unchanged as a single row
    (previously they made the function raise).

    Args:
        df: Source frame.
        column_name: Column whose string cells are split on "\\n".

    Returns:
        A new DataFrame with one row per line of text.
    """
    pieces = df[column_name].map(
        lambda cell: cell.split("\n") if isinstance(cell, str) else cell
    )

    # Work on a copy so the caller's frame keeps its original cells.
    expanded = df.copy()
    expanded[column_name] = pieces

    # explode() repeats the other columns for each list element, replacing the
    # manual repeat/reindex bookkeeping.
    return expanded.explode(column_name, ignore_index=True)


def transform_documents_to_dataframe(
    documents: "list[tuple[Document, float]]",
) -> pd.DataFrame:
    """Flatten ``(document, score)`` pairs into a DataFrame, one row per pair.

    Columns are ``document_score`` and ``page_content`` followed by every
    metadata key in first-seen order. A document that lacks a metadata key
    seen on another document gets a missing value in that column, so
    heterogeneous metadata no longer breaks or misaligns the frame.

    Args:
        documents: Iterable of ``(doc, score)`` pairs, where ``doc`` exposes
            a ``metadata`` mapping and a ``page_content`` attribute.

    Returns:
        A DataFrame with one row per input pair (empty, but with the two
        fixed columns, when *documents* is empty).
    """
    columns = ["document_score", "page_content"]
    known_columns = set(columns)
    records = []

    for doc, score in documents:
        record = dict(doc.metadata)
        # The fixed columns are written last so a metadata key with the same
        # name cannot clobber the score or the content.
        record["document_score"] = score
        record["page_content"] = doc.page_content
        records.append(record)

        for key in doc.metadata:
            if key not in known_columns:
                known_columns.add(key)
                columns.append(key)

    return pd.DataFrame(records, columns=columns)


def remove_duplicates_by_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Drop rows that repeat an earlier value of *column_name*, in place.

    The frame passed in is itself modified (and re-indexed from 0); the same
    object is returned for convenience.

    Args:
        df: Frame to deduplicate; mutated by this call.
        column_name: Column (or column subset) used to detect duplicates.

    Returns:
        The same ``df`` object, after deduplication.
    """
    dedupe_options = {
        "subset": column_name,
        "ignore_index": True,
        "inplace": True,
    }
    df.drop_duplicates(**dedupe_options)
    return df


def dataframe_to_dict(df: pd.DataFrame) -> list[dict]:
    """Convert *df* into a list of row dictionaries.

    Despite the function's name, the result is a list: one
    ``{column: value}`` dict per row, in row order.

    Args:
        df: Frame to convert.

    Returns:
        ``df.to_dict(orient="records")``.
    """
    return df.to_dict(orient="records")


def duplicate_rows_with_synonyms(df: pd.DataFrame, column: str, synonyms: list[list[str]]) -> pd.DataFrame:
    """Append synonym-substituted copies after each row that mentions a synonym.

    For every row, each synonym that occurs in ``row[column]`` as a whole
    word (case-insensitively, optionally followed by "s", "es", "ed" or
    "ing") produces one extra row per OTHER member of its group, with every
    such occurrence replaced by that member. The original row always comes
    first; the result is re-indexed from 0.

    Synonyms are matched literally (regex metacharacters are escaped) and
    replacements are inserted literally. Cells that are not strings are kept
    as-is and generate no copies.

    Args:
        df: Source frame; not modified.
        column: Column holding the text to scan.
        synonyms: Groups of interchangeable terms.

    Returns:
        A new DataFrame with the original rows plus the generated copies.
    """
    # Compile every pattern once instead of once per row.
    compiled_groups = [
        (
            synonym_list,
            [
                (
                    synonym,
                    re.compile(
                        r"\b({}(?:s|es|ed|ing)?)\b".format(re.escape(synonym)),
                        re.IGNORECASE,
                    ),
                )
                for synonym in synonym_list
            ],
        )
        for synonym_list in synonyms
    ]

    new_rows = []
    for _, row in df.iterrows():
        new_rows.append(row)
        text = row[column]
        if not isinstance(text, str):
            continue
        for synonym_list, patterns in compiled_groups:
            for synonym, pattern in patterns:
                if not pattern.search(text):
                    continue
                for replacement in synonym_list:
                    if replacement == synonym:
                        continue
                    new_row = row.copy()
                    # A callable replacement is inserted verbatim, so a
                    # backslash in a synonym is not read as a regex escape.
                    new_row[column] = pattern.sub(
                        lambda _match, literal=replacement: literal, text
                    )
                    new_rows.append(new_row)

    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df.reset_index(drop=True)


def remove_empty_rows(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Keep only rows whose *column_name* cell has non-whitespace content.

    A cell counts as empty when stripping it leaves a falsy value (for
    strings: the empty string). The input frame is left untouched.

    Args:
        df: Source frame.
        column_name: Column to test for emptiness.

    Returns:
        A filtered DataFrame re-indexed from 0.
    """
    stripped = df[column_name].str.strip()
    has_content = stripped.astype(bool)
    return df[has_content].reset_index(drop=True)