|
import pandas as pd |
|
|
|
|
|
def format_docs(docs):
    """Print every document's text, numbered from 1 and split by a dashed rule.

    Args:
        docs: Iterable of objects exposing a string ``page_content``
            attribute (e.g. LangChain ``Document`` instances).
    """
    # A 100-dash line, padded with newlines, goes between consecutive documents.
    divider = "\n" + "-" * 100 + "\n"

    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)

    print(divider.join(sections))
|
|
|
|
|
def _clean_text(value):
    """Strip, title-case and normalise apostrophes in *value* if it is a string."""
    if not isinstance(value, str):
        # NaN, numbers, dates, ... pass through untouched so no data is lost.
        return value
    # U+2019 (right single quotation mark) becomes a plain ASCII apostrophe.
    return value.strip().title().replace("\u2019", "'")


def excel_to_dataframe(data_directory: str) -> pd.DataFrame:
    """Load the single ``.xlsx`` file in a directory into a cleaned DataFrame.

    Cleaning steps, in order:

    1. String column headers are title-cased.
    2. In every text column except ``Booking Link``, string cells are
       stripped of surrounding whitespace, title-cased, and have the
       typographic apostrophe (U+2019) replaced by ``'``. Non-string cells
       are left as they are.
    3. Remaining missing values are replaced by
       ``'Information Not Available'``.

    Args:
        data_directory (str): Path to the directory that holds the Excel
            file. A ``pathlib.Path`` is accepted as well.

    Raises:
        FileNotFoundError: If the directory contains no ``.xlsx`` file.
        ValueError: If the directory contains more than one ``.xlsx`` file.

    Returns:
        pd.DataFrame: The cleaned contents of the Excel file.
    """
    # Local import keeps this block self-contained.
    from pathlib import Path

    # Accept plain strings as documented; Path(Path(...)) is a no-op.
    directory = Path(data_directory)
    excel_files = [entry for entry in directory.iterdir()
                   if entry.suffix == '.xlsx']

    if not excel_files:
        raise FileNotFoundError(
            "No Excel files found in the specified directory.")
    if len(excel_files) > 1:
        raise ValueError(
            "More than one Excel file found in the specified directory.")

    df = pd.read_excel(excel_files[0], engine='openpyxl')

    # Title-case headers, leaving non-string headers (e.g. numbers) alone.
    df.columns = [
        name.title() if isinstance(name, str) else name
        for name in df.columns
    ]

    for col in df.columns:
        # The booking-link column is deliberately skipped -- presumably
        # because title-casing would corrupt URLs (confirm with data owner).
        if isinstance(col, str) and col.lower() == 'booking link':
            continue

        column = df[col]
        # Dedicated pandas string dtypes compare unequal to 'object', so
        # check for both kinds of text column.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if is_text:
            # Element-wise cleaning: the .str accessor would turn every
            # non-string cell into NaN.
            df[col] = column.map(_clean_text)

    df.fillna('Information Not Available', inplace=True)

    return df
|
|