faq / util.py
andreasmartin's picture
deepnote update
1132b50
raw
history blame
1.25 kB
import pandas as pd
def split_page_breaks(df, column_name):
split_values = df[column_name].str.split("\n")
new_df = pd.DataFrame({column_name: split_values.explode()})
new_df.reset_index(drop=True, inplace=True)
column_order = df.columns
new_df = new_df.reindex(column_order, axis=1)
other_columns = column_order.drop(column_name)
for column in other_columns:
new_df[column] = (
df[column].repeat(split_values.str.len()).reset_index(drop=True)
)
return new_df
def transform_documents_to_dataframe(documents):
metadata_keys = set()
for doc, _ in documents:
metadata_keys.update(doc.metadata.keys())
metadata_values = {key: [] for key in metadata_keys}
for doc, _ in documents:
for key, value in doc.metadata.items():
metadata_values[key].append(value)
metadata_values["Score"] = [score for _, score in documents]
df = pd.DataFrame(metadata_values)
return df
def remove_duplicates_by_column(df, column):
df.drop_duplicates(subset=column, inplace=True)
df.reset_index(drop=True, inplace=True)
return df
def serialize_dataframe_as_json(df):
json_array = df.to_dict(orient='records')
return json_array