import os
import time
import re
import ast
import pandas as pd
import gradio as gr
from typing import Type, List, Literal
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

PandasDataFrame = Type[pd.DataFrame]
|
|
class Document(BaseModel):
    """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""

    page_content: str
    """String text."""
    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata about the page content (e.g., source, relationships to other
    documents, etc.).
    """
    type: Literal["Document"] = "Document"
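# For illustration, a Document can be constructed directly (the values here are made up):
#   Document(page_content="Some text", metadata={"source": "example.csv"})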
|
|
# Default parameters for splitting text into chunks
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
chunk_size = 500
chunk_overlap = 0
start_index = True
|
|
def determine_file_type(file_path):
    """
    Determine the file type based on its extension.

    Parameters:
        file_path (str): Path to the file.

    Returns:
        str: File extension (e.g., '.csv', '.xlsx', '.parquet').
    """
    return os.path.splitext(file_path)[1].lower()
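# For example, determine_file_type("data/report.xlsx") returns ".xlsx" (the path is hypothetical).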
|
|
|
def parse_file(file_paths, text_column='text'):
    """
    Accepts a list of file paths, determines each file's type based on its extension,
    and passes it to the relevant parsing function.

    Parameters:
        file_paths (list): List of file objects (e.g. from a Gradio file upload), each with a .name attribute.
        text_column (str): Name of the column in CSV/Excel files that contains the text content.

    Returns:
        tuple: A dictionary with file paths as keys and their parsed content (or an error
        message) as values, and a list of the corresponding file names.
    """

    if not isinstance(file_paths, list):
        raise ValueError("Expected a list of file paths.")

    extension_to_parser = {
        '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
        '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column),
        '.parquet': lambda file_path: parse_csv_or_excel(file_path, text_column)
    }

    parsed_contents = {}
    file_names = []

    for file_path in file_paths:
        print(file_path.name)

        file_extension = determine_file_type(file_path.name)
        if file_extension in extension_to_parser:
            parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
        else:
            parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"

        filename_end = get_file_path_end(file_path.name)
        file_names.append(filename_end)

    return parsed_contents, file_names
|
|
|
def text_regex_clean(text):
    """Apply a series of regex fixes to clean up text extracted from documents."""

    # Merge hyphenated words that were split across line breaks, e.g. "compa-\nny" -> "company"
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
    # Add a full stop before a paragraph break that directly follows a letter
    text = re.sub(r'(?<=[a-zA-Z])\n\n', '.\n\n', text)
    # Replace single newlines (not part of a paragraph break) with spaces
    text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
    # Normalise runs of blank lines to a single paragraph break
    text = re.sub(r"\n\s*\n", "\n\n", text)
    # Collapse double spaces
    text = re.sub(r"  ", " ", text)
    # Insert a sentence/paragraph break where a lowercase letter runs straight into an uppercase one
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', '. \n\n', text)

    return text
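# Illustrative behaviour on a made-up string:
#   text_regex_clean("A hyphen-\nated word.\nNext line")
#   returns "A hyphenated word. Next line"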
|
|
|
def parse_csv_or_excel(file_path, text_column = "text"):
    """
    Read in a CSV, Excel or Parquet file.

    Parameters:
        file_path: List of file objects (each with a .name attribute) to read from.
        text_column (str): Name of the column in the file that contains the text content.

    Returns:
        tuple: The loaded DataFrame, a list of file names, and a status message.
    """

    file_list = [string.name for string in file_path]

    print(file_list)

    # Ignore tokenised versions of the files if present
    data_file_names = [string for string in file_list if "tokenised" not in string]

    file_extension = determine_file_type(data_file_names[0])
    file_name = get_file_path_end(data_file_names[0])
    file_names = [file_name]

    print(file_extension)

    if file_extension == ".csv":
        df = pd.read_csv(data_file_names[0], low_memory=False)
    elif file_extension == ".xlsx":
        df = pd.read_excel(data_file_names[0], engine='openpyxl')
    elif file_extension == ".parquet":
        df = pd.read_parquet(data_file_names[0])
    else:
        print(f"Unsupported file type: {file_extension}")
        return pd.DataFrame(), file_names, 'Please choose a valid file type'

    if text_column not in df.columns:
        return pd.DataFrame(), file_names, 'Please choose a valid column name'

    df['source'] = file_name
    df['page_section'] = ""

    message = "Loaded in file. Now converting to document format."
    print(message)

    return df, file_names, message
|
|
|
def get_file_path_end(file_path):
    # Strip any directory component, keeping the file name (with extension)
    match = re.search(r'(.*[\/\\])?(.+)$', file_path)
    filename_end = match.group(2) if match else ''
    return filename_end
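# For example, get_file_path_end("folder/data.csv") returns "data.csv"
# (both forward and backslashes are handled).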
|
|
def text_to_docs(text_dict: dict, chunk_size: int = chunk_size) -> List[Document]:
    """
    Converts the output of parse_file (a dictionary of file paths to content)
    to a list of Documents with metadata.
    """

    doc_sections = []

    for file_path, content in text_dict.items():
        ext = os.path.splitext(file_path)[1].lower()

        if ext in ['.csv', '.xlsx', '.parquet']:
            # parse_csv_or_excel returns (df, file_names, message), so unpack the DataFrame first
            if isinstance(content, tuple):
                content = content[0]
            docs, message = csv_excel_text_to_docs(content, chunk_size=chunk_size)
        else:
            print(f"Unsupported file type {ext} for {file_path}. Skipping.")
            continue

        filename_end = get_file_path_end(file_path)

        # Tag each document with the file it came from
        for doc in docs:
            doc.metadata["source"] = filename_end

        doc_sections.extend(docs)

    return doc_sections
|
|
def write_out_metadata_as_string(metadata_in):
    # Accept either a single metadata dict or a list of them
    if isinstance(metadata_in, dict):
        metadata_in = [metadata_in]

    # Render each metadata dict as "key: value" pairs, excluding the page_section field
    metadata_string = [' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section') for d in metadata_in]
    return metadata_string
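# For example (made-up dict): write_out_metadata_as_string({"row": 1, "source": "a.csv"})
# returns ['row: 1 source: a.csv']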
|
|
|
def combine_metadata_columns(df, cols):
    """Combine the given columns into a single JSON-like metadata string per row."""

    df['metadatas'] = "{"

    for col in cols:
        # Escape double quotes and strip line breaks so the result parses cleanly later
        df[col] = df[col].astype(str).str.replace('"', "'").str.replace('\n', ' ').str.replace('\r', ' ')
        df['metadatas'] = df['metadatas'] + '"' + col + '": "' + df[col] + '", '

    # Close the brace, dropping the trailing comma
    df['metadatas'] = (df['metadatas'] + "}").str.replace(', }', '}')

    return df['metadatas']
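# Illustrative output (hypothetical column values): for a row with source="report.csv"
# and page=2, combine_metadata_columns produces the string
#   '{"source": "report.csv", "page": "2"}'
# which parse_metadata below can turn back into a dict.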
|
|
|
def csv_excel_text_to_docs(df, text_column='text', chunk_size=None) -> List[Document]:
    """Converts a DataFrame's content to a list of Documents with metadata.

    Note: this definition is shadowed at import time by the function of the same
    name further below, which does not chunk the text.
    """

    print("Converting to documents.")

    doc_sections = []
    df[text_column] = df[text_column].astype(str)

    for idx, row in df.iterrows():
        doc_content = row[text_column]

        # Everything apart from the text column becomes metadata
        metadata = {"row": idx + 1}
        for col, value in row.items():
            if col != text_column:
                metadata[col] = value

        metadata_string = write_out_metadata_as_string(metadata)[0]

        if chunk_size:
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                separators=split_strat,
                add_start_index=start_index
            )

            sections = text_splitter.split_text(doc_content)

            # Prepend the metadata string to each chunk so it remains searchable
            for i, section in enumerate(sections):
                section = '. '.join([metadata_string, section])
                doc = Document(page_content=section,
                               metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
                doc_sections.append(doc)

        else:
            doc = Document(page_content=doc_content, metadata=metadata)
            doc_sections.append(doc)

    message = "Data converted to document format. Now creating/loading document embeddings."
    print(message)

    return doc_sections, message
|
|
def clean_line_breaks(text):
    # Replace newlines and carriage returns with spaces ('\r\n' is already covered by these two)
    return text.replace('\n', ' ').replace('\r', ' ')
|
|
|
def parse_metadata(row):
    try:
        row_str = str(row) if not isinstance(row, str) else row

        row_str = row_str.replace('\n', ' ').replace('\r', ' ')

        # Interpret the string as a Python dict literal
        metadata = ast.literal_eval(row_str)

        return metadata
    except (SyntaxError, ValueError) as e:
        print(f"Failed to parse metadata: {row_str}")
        print(f"Error: {e}")
        return None
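# Example round trip with the string format built by combine_metadata_columns:
#   parse_metadata('{"source": "report.csv", "page": "2"}')
#   returns {'source': 'report.csv', 'page': '2'}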
|
|
|
def csv_excel_text_to_docs(df, text_column='text', chunk_size=None, progress=gr.Progress()) -> List[Document]:
    """Converts a DataFrame's content to a list of Documents containing page_content and associated metadata. chunk_size is accepted for signature compatibility but is not used here: each row becomes a single document."""

    ingest_tic = time.perf_counter()

    doc_sections = []
    df[text_column] = df[text_column].astype(str).str.strip()

    # All non-text columns are folded into a single metadata string per row
    cols = [col for col in df.columns if col != text_column]
    df["metadata"] = combine_metadata_columns(df, cols)

    df = df.rename(columns={text_column: "page_content"})

    doc_sections = [Document(page_content=row['page_content'],
                             metadata=parse_metadata(row["metadata"]))
                    for index, row in progress.tqdm(df.iterrows(), desc="Splitting up text", unit="rows")]

    ingest_toc = time.perf_counter()

    ingest_time_out = f"Preparing documents took {ingest_toc - ingest_tic:0.1f} seconds"
    print(ingest_time_out)

    return doc_sections, "Finished splitting documents"
|
|
def pull_out_data(series):
    # Each cell holds a stringified tuple; parse it with ast.literal_eval rather than eval,
    # which is safer (this assumes the cells contain plain Python literals)
    to_tuple = lambda x: ast.literal_eval(x)

    series_tup = series.apply(to_tuple)

    # Keep the second element of each tuple (the content of interest)
    series_tup_content = list(zip(*series_tup))[1]

    series = pd.Series(list(series_tup_content))

    return series
|
|
|
def docs_from_csv(df):
    documents = []

    page_content = pull_out_data(df["0"])
    metadatas = pull_out_data(df["1"])

    for x in range(0, len(df)):
        new_doc = Document(page_content=page_content[x], metadata=metadatas[x])
        documents.append(new_doc)

    return documents
|
|
|
def docs_from_lists(docs, metadatas):
    documents = []

    for x, doc in enumerate(docs):
        new_doc = Document(page_content=doc, metadata=metadatas[x])
        documents.append(new_doc)

    return documents
|
|
|
def docs_elements_from_csv_save(docs_path="documents.csv"):
    documents = pd.read_csv(docs_path)

    docs_out = docs_from_csv(documents)

    out_df = pd.DataFrame(docs_out)

    docs_content = pull_out_data(out_df[0].astype(str))
    docs_meta = pull_out_data(out_df[1].astype(str))

    doc_sources = [d['source'] for d in docs_meta]

    return out_df, docs_content, docs_meta, doc_sources
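

# A minimal, self-contained usage sketch (illustrative only: the passages and
# metadata below are made up). It exercises docs_from_lists, which needs no files:
if __name__ == "__main__":
    example_docs = docs_from_lists(
        ["First passage of text.", "Second passage of text."],
        [{"source": "example"}, {"source": "example"}],
    )
    print(f"Built {len(example_docs)} documents; first metadata: {example_docs[0].metadata}")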
|
|