Spaces:
Sleeping
Sleeping
import pandas as pd | |
def split_page_breaks(df, column_name):
    """Return a new DataFrame with one row per newline-separated segment.

    Each value in ``column_name`` is split on ``"\\n"``; every resulting
    segment gets its own row, and the values of all other columns are
    repeated for each segment of the row they came from.

    Args:
        df: Source DataFrame. It is not modified.
        column_name: Name of the text column to split.

    Returns:
        A new DataFrame with the same column order as ``df`` and a fresh
        ``0..n-1`` index.

    Note:
        Cells that ``.str.split`` cannot split (missing values, non-string
        objects) come back as a single row with a missing value in
        ``column_name`` instead of raising.
    """
    # Work on a re-indexed copy so the caller's frame is untouched and so
    # duplicate index labels in the input cannot confuse the explode step.
    expanded = df.reset_index(drop=True)
    expanded[column_name] = expanded[column_name].str.split("\n")
    # DataFrame.explode repeats the other columns for us and keeps the
    # original column order; a NaN cell yields exactly one output row.
    return expanded.explode(column_name).reset_index(drop=True)
def transform_documents_to_dataframe(documents):
    """Build a DataFrame of document metadata plus a ``"Score"`` column.

    Args:
        documents: Iterable of ``(doc, score)`` pairs, where each ``doc``
            exposes a ``metadata`` mapping.

    Returns:
        A DataFrame with one row per pair. There is one column per metadata
        key seen in any document (in first-seen order) and a ``"Score"``
        column holding the paired scores. A document that lacks a given
        key gets ``None`` in that column. If a metadata key is itself named
        ``"Score"``, the paired score values take precedence.
    """
    # Materialise once: the input is walked several times below, which a
    # generator would not survive.
    documents = list(documents)

    # A dict (not a set) keeps column order deterministic.
    columns = {}
    for doc, _ in documents:
        for key in doc.metadata:
            columns.setdefault(key, [])

    # Every document contributes exactly one entry per column so that all
    # columns stay the same length and values stay on their own row.
    for doc, _ in documents:
        for key, values in columns.items():
            values.append(doc.metadata.get(key))

    columns["Score"] = [score for _, score in documents]
    return pd.DataFrame(columns)
def remove_duplicates_by_column(df, column):
    """Drop rows that repeat an earlier value of ``column``, in place.

    The first occurrence of each value is kept. ``df`` itself is modified,
    its index is renumbered from zero, and the same object is returned.
    """
    # ignore_index=True renumbers the surviving rows as part of the same
    # in-place operation.
    df.drop_duplicates(subset=column, inplace=True, ignore_index=True)
    return df
def serialize_dataframe_as_json(df):
    """Convert ``df`` into a list of per-row dictionaries.

    Each row becomes one ``{column: value}`` dict. The result is a Python
    list, not an encoded JSON string.
    """
    return df.to_dict(orient="records")