import re

import pandas as pd
from langchain.docstore.document import Document

# A Google Sheets viewer URL has the form <SHEET_URL_X><sheet id><SHEET_URL_Y><gid>;
# swapping SHEET_URL_Y for SHEET_URL_Y_EXPORT yields a link pandas can read as Excel.
SHEET_URL_X = "https://docs.google.com/spreadsheets/d/"
SHEET_URL_Y = "/edit#gid="
SHEET_URL_Y_EXPORT = "/export?gid="

# Optional preprocessing switches consumed by read_df.
SPLIT_PAGE_BREAKS = False
SYNONYMS = None


def get_id(sheet_url: str) -> str:
    """Extract "<sheet id>-<gid>" from a Google Sheets edit URL."""
    x = sheet_url.find(SHEET_URL_X)
    y = sheet_url.find(SHEET_URL_Y)
    return sheet_url[x + len(SHEET_URL_X) : y] + "-" + sheet_url[y + len(SHEET_URL_Y) :]


def xlsx_url(get_id: str) -> str:
    """Turn the "<id>-<gid>" string produced by get_id into an xlsx export URL."""
    y = get_id.rfind("-")
    return SHEET_URL_X + get_id[0:y] + SHEET_URL_Y_EXPORT + get_id[y + 1 :]
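
# Illustrative round trip (the sheet id "abc123" and gid "0" are made-up placeholders):
#   get_id("https://docs.google.com/spreadsheets/d/abc123/edit#gid=0") -> "abc123-0"
#   xlsx_url("abc123-0") -> "https://docs.google.com/spreadsheets/d/abc123/export?gid=0"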


def read_df(xlsx_url: str, page_content_column: str) -> pd.DataFrame:
    """Download the exported sheet and apply the optional preprocessing steps."""
    # keep_default_na=False keeps empty cells as "" instead of NaN, so the string
    # operations below never see float values.
    df = pd.read_excel(xlsx_url, header=0, keep_default_na=False)
    if SPLIT_PAGE_BREAKS:
        df = split_page_breaks(df, page_content_column)
    df = remove_empty_rows(df, page_content_column)
    if SYNONYMS is not None:
        df = duplicate_rows_with_synonyms(df, page_content_column, SYNONYMS)
    return df
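
# Typical loading path (the URL and the column name are placeholders, not a fixed schema):
#   df = read_df(xlsx_url(get_id("https://docs.google.com/spreadsheets/d/abc123/edit#gid=0")),
#                "page_content")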


def split_page_breaks(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Split multi-line cells in column_name into one row per line."""
    split_values = df[column_name].str.split("\n")
    new_df = pd.DataFrame({column_name: split_values.explode()})
    new_df.reset_index(drop=True, inplace=True)
    # Restore the original column order, then repeat every other column's value
    # once per line its row was split into.
    column_order = df.columns
    new_df = new_df.reindex(column_order, axis=1)
    other_columns = column_order.drop(column_name)
    for column in other_columns:
        new_df[column] = (
            df[column].repeat(split_values.str.len()).reset_index(drop=True)
        )
    return new_df
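

# Minimal sketch of the row explosion above; the column names are illustrative only,
# not a schema the rest of the app depends on.
def _demo_split_page_breaks() -> pd.DataFrame:
    df = pd.DataFrame(
        {
            "page_content": ["first line\nsecond line", "single line"],
            "source": ["sheet-a", "sheet-b"],
        }
    )
    # Yields three rows: the two lines of the first cell plus the second cell,
    # with "source" repeated for each line its original row produced.
    return split_page_breaks(df, "page_content")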


def transform_documents_to_dataframe(
    documents: list[tuple[Document, float]]
) -> pd.DataFrame:
    """Flatten (Document, score) pairs into a DataFrame.

    Each metadata key becomes a column; every document is expected to carry the
    same metadata keys, otherwise the columns end up with unequal lengths.
    """
    keys = []
    values = {"document_score": [], "page_content": []}
    for doc, score in documents:
        for key, value in doc.metadata.items():
            if key not in keys:
                keys.append(key)
                values[key] = []
            values[key].append(value)
        values["document_score"].append(score)
        values["page_content"].append(doc.page_content)
    return pd.DataFrame(values)
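

# Minimal sketch of the expected input: (Document, score) pairs like those returned by a
# LangChain vector store's similarity_search_with_score. The metadata and scores are made up.
def _demo_transform_documents() -> pd.DataFrame:
    docs_with_scores = [
        (Document(page_content="How do I reset my password?", metadata={"answer": "Use the reset link."}), 0.12),
        (Document(page_content="How do I change my email?", metadata={"answer": "Open account settings."}), 0.34),
    ]
    # One row per document, with document_score, page_content and an "answer" column.
    return transform_documents_to_dataframe(docs_with_scores)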


def remove_duplicates_by_column(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Drop rows that repeat an earlier value in column_name (modifies df in place)."""
    df.drop_duplicates(subset=column_name, inplace=True, ignore_index=True)
    return df


def dataframe_to_dict(df: pd.DataFrame) -> list[dict]:
    """Convert the DataFrame into a list with one record dict per row."""
    return df.to_dict(orient="records")


def duplicate_rows_with_synonyms(
    df: pd.DataFrame, column: str, synonyms: list[list[str]]
) -> pd.DataFrame:
    """For every row that mentions a synonym, add copies with the other synonyms substituted."""
    new_rows = []
    for _, row in df.iterrows():
        new_rows.append(row)
        text = row[column]
        for synonym_list in synonyms:
            for synonym in synonym_list:
                # Match the term case-insensitively, including simple s/es/ed/ing suffixes.
                pattern = r"(?i)\b({}(?:s|es|ed|ing)?)\b".format(re.escape(synonym))
                if re.search(pattern, text):
                    for replacement in synonym_list:
                        if replacement != synonym:
                            new_row = row.copy()
                            new_row[column] = re.sub(pattern, replacement, text)
                            new_rows.append(new_row)
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    new_df = new_df.reset_index(drop=True)
    return new_df
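

# Minimal sketch of the synonym expansion; the terms are illustrative placeholders.
def _demo_duplicate_rows_with_synonyms() -> pd.DataFrame:
    df = pd.DataFrame({"page_content": ["How do I reset my password?"]})
    # Returns the original row plus a copy with "password" replaced by "passcode".
    return duplicate_rows_with_synonyms(df, "page_content", [["password", "passcode"]])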


def remove_empty_rows(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    """Drop rows whose value in column_name is empty or whitespace only."""
    df = df[df[column_name].str.strip().astype(bool)]
    df = df.reset_index(drop=True)
    return df
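

# Ad-hoc smoke test for the illustrative sketches above; it only runs when this file is
# executed directly, never when the Space imports it.
if __name__ == "__main__":
    print(_demo_split_page_breaks())
    print(_demo_transform_documents())
    print(_demo_duplicate_rows_with_synonyms())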