File size: 1,976 Bytes
1132b50
 
4dc1d14
 
 
67bfb80
4dc1d14
 
 
 
 
 
 
 
 
 
 
 
 
67bfb80
 
 
 
 
4dc1d14
 
1132b50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4dc1d14
 
1132b50
4dc1d14
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd

# Leading part of a Google Sheets URL; get_id() reads the text that follows it
# and xlsx_url() uses it as the start of the URL it builds.
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
# Marker get_id() looks for: text before it (after SHEET_URL_X) and text after
# it become the two halves of the identifier.
SHEET_URL_Y = "/edit#gid="
# Marker xlsx_url() inserts between the two halves when building an export URL.
SHEET_URL_Y_EXPORT = "/export?gid="
# Default value of read_df()'s split_page_breaks flag.
SPLIT_PAGE_BREAKS = False


def get_id(sheet_url: str) -> str:
    """Condense a Google Sheets URL into a ``<key>-<gid>`` identifier.

    The text between ``SHEET_URL_X`` and ``SHEET_URL_Y`` and the text that
    follows ``SHEET_URL_Y`` are joined with a hyphen.

    Args:
        sheet_url: URL containing ``SHEET_URL_X`` and, after it,
            ``SHEET_URL_Y``.

    Returns:
        The two extracted parts joined as ``"<key>-<gid>"``.

    Raises:
        ValueError: If either marker is absent. ``str.find`` would return -1
            in that case and the slices would silently produce a wrong
            identifier.
    """
    prefix_at = sheet_url.find(SHEET_URL_X)
    if prefix_at == -1:
        raise ValueError(f"not a Google Sheets URL (missing {SHEET_URL_X!r}): {sheet_url!r}")
    key_start = prefix_at + len(SHEET_URL_X)

    # Search only after the prefix so the marker cannot match ahead of the key.
    key_end = sheet_url.find(SHEET_URL_Y, key_start)
    if key_end == -1:
        raise ValueError(f"sheet URL is missing {SHEET_URL_Y!r}: {sheet_url!r}")

    gid = sheet_url[key_end + len(SHEET_URL_Y):]
    return f"{sheet_url[key_start:key_end]}-{gid}"


def xlsx_url(get_id: str) -> str:
    """Build an export URL from a ``<key>-<gid>`` identifier.

    The identifier is split at its LAST hyphen; the left part is placed after
    ``SHEET_URL_X`` and the right part after ``SHEET_URL_Y_EXPORT``.

    Args:
        get_id: Identifier in the ``"<key>-<gid>"`` form. (The parameter name
            shadows the module-level ``get_id`` function inside this body; it
            is kept unchanged for keyword callers.)

    Returns:
        ``SHEET_URL_X + <key> + SHEET_URL_Y_EXPORT + <gid>``.

    Raises:
        ValueError: If the identifier contains no hyphen. ``str.rfind`` would
            return -1 and the slices would silently build a malformed URL.
    """
    separator_at = get_id.rfind("-")
    if separator_at == -1:
        raise ValueError(f"sheet id must look like '<key>-<gid>': {get_id!r}")

    key = get_id[:separator_at]
    gid = get_id[separator_at + 1:]
    return f"{SHEET_URL_X}{key}{SHEET_URL_Y_EXPORT}{gid}"


def read_df(
    xlsx_url: str,
    split_page_breaks: bool = SPLIT_PAGE_BREAKS,
    *,
    page_content_column: "str | None" = None,
) -> pd.DataFrame:
    """Load a spreadsheet into a DataFrame, optionally splitting one column by line.

    The sheet is read with ``pd.read_excel`` using the first row as the header
    and ``keep_default_na=False``.

    Args:
        xlsx_url: Location passed straight to ``pd.read_excel``.
        split_page_breaks: When true, the result is passed through the
            module-level ``split_page_breaks`` helper.
        page_content_column: Name of the column to split. Required when
            ``split_page_breaks`` is true.

    Returns:
        The loaded (and possibly row-expanded) DataFrame.

    Raises:
        ValueError: If splitting is requested without ``page_content_column``.
    """
    # Validate before the read so a bad call fails without doing any I/O.
    if split_page_breaks and page_content_column is None:
        raise ValueError(
            "page_content_column is required when split_page_breaks is true"
        )

    df = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
    if not split_page_breaks:
        return df

    # The boolean parameter shadows the module-level helper of the same name,
    # so the helper has to be fetched from the module namespace explicitly.
    split_rows = globals()["split_page_breaks"]
    return split_rows(df, page_content_column)


def split_page_breaks(df, column_name):
    """Return a copy of ``df`` with one row per line of ``column_name``.

    Each value of ``column_name`` is split on ``"\\n"``; every resulting piece
    gets its own row, and the values of all other columns are repeated for
    each piece. Column order matches ``df`` and the index is renumbered
    from zero.
    """
    pieces = df[column_name].str.split("\n")

    expanded = {}
    for name in df.columns:
        if name == column_name:
            # One row per piece.
            expanded[name] = pieces.explode().reset_index(drop=True)
        else:
            # Repeat each source value once per piece of its row.
            expanded[name] = (
                df[name].repeat(pieces.str.len()).reset_index(drop=True)
            )

    return pd.DataFrame(expanded, columns=df.columns)


def transform_documents_to_dataframe(documents):
    """Turn ``(document, score)`` pairs into a DataFrame of metadata plus score.

    Each pair becomes one row. There is one column per metadata key seen in
    any document, in first-seen order, and a ``"Score"`` column holding the
    second element of each pair. A metadata key named ``"Score"`` is
    overwritten by the scores.

    Args:
        documents: Iterable of ``(doc, score)`` pairs where ``doc.metadata``
            is a mapping. One-shot iterables are accepted.

    Returns:
        A DataFrame with one row per pair. Keys a document does not have are
        filled with a missing value, so rows stay aligned.
    """
    # Materialise once: the pairs are walked several times below.
    pairs = list(documents)

    # A dict (not a set) keeps the column order deterministic.
    column_names = {}
    for doc, _ in pairs:
        column_names.update(dict.fromkeys(doc.metadata))

    # Every column gets exactly one entry per document, so documents with
    # differing metadata keys can no longer yield unequal or shifted columns.
    columns = {
        key: [doc.metadata.get(key) for doc, _ in pairs] for key in column_names
    }
    columns["Score"] = [score for _, score in pairs]

    return pd.DataFrame(columns)


def remove_duplicates_by_column(df, column):
    """Drop rows whose ``column`` value repeats an earlier row, in place.

    The first occurrence of each value is kept and the index is renumbered
    from zero. ``df`` itself is modified and then returned.
    """
    # ignore_index=True renumbers the surviving rows within the same call.
    df.drop_duplicates(subset=column, inplace=True, ignore_index=True)
    return df


def dataframe_to_dict(df):
    """Return the rows of ``df`` as a list of ``{column: value}`` dicts."""
    return df.to_dict(orient="records")