# Install/ import stuff we need

import os
import time
import re
import ast
import gzip
import pandas as pd
import gradio as gr

from typing import Type, List, Literal
#from langchain.text_splitter import RecursiveCharacterTextSplitter
from pydantic import BaseModel, Field

# Creating an alias for pandas DataFrame using Type
PandasDataFrame = Type[pd.DataFrame]
class Document(BaseModel):
    """Class for storing a piece of text and associated metadata. Implementation adapted from Langchain code: https://github.com/langchain-ai/langchain/blob/master/libs/core/langchain_core/documents/base.py"""

    page_content: str
    """String text."""
    metadata: dict = Field(default_factory=dict)
    """Arbitrary metadata about the page content (e.g., source, relationships to other
    documents, etc.).
    """
    type: Literal["Document"] = "Document"
# Constants for chunking - not currently used
split_strat = ["\n\n", "\n", ". ", "! ", "? "]
chunk_size = 512
chunk_overlap = 0
start_index = True

from search_funcs.helper_functions import get_file_path_end_with_ext, detect_file_type, get_file_path_end
from search_funcs.bm25_functions import save_prepared_bm25_data
from search_funcs.clean_funcs import initial_clean
## Parse files

# def detect_file_type(file_path):
#     """
#     Determine the file type based on its extension.
#
#     Parameters:
#         file_path (str): Path to the file.
#
#     Returns:
#         str: File extension (e.g., '.pdf', '.docx', '.txt', '.html').
#     """
#     return os.path.splitext(file_path)[1].lower()
def parse_file_not_used(file_paths, text_column='text'):
    """
    Accepts a list of file paths, determines each file's type based on its extension,
    and passes it to the relevant parsing function.

    Parameters:
        file_paths (list): List of file paths.
        text_column (str): Name of the column in CSV/Excel files that contains the text content.

    Returns:
        tuple: A dictionary with file paths as keys and their parsed content (or error message)
        as values, and a list of file names.
    """

    if not isinstance(file_paths, list):
        raise ValueError("Expected a list of file paths.")

    extension_to_parser = {
        # '.pdf': parse_pdf,
        # '.docx': parse_docx,
        # '.txt': parse_txt,
        # '.html': parse_html,
        # '.htm': parse_html,  # Considering both .html and .htm for HTML files
        '.csv': lambda file_path: parse_csv_or_excel(file_path, text_column),
        '.xlsx': lambda file_path: parse_csv_or_excel(file_path, text_column),
        '.parquet': lambda file_path: parse_csv_or_excel(file_path, text_column)
    }

    parsed_contents = {}
    file_names = []

    for file_path in file_paths:
        #print(file_path.name)
        #file = open(file_path.name, 'r')
        #print(file)
        file_extension = detect_file_type(file_path.name)
        if file_extension in extension_to_parser:
            parsed_contents[file_path.name] = extension_to_parser[file_extension](file_path.name)
        else:
            parsed_contents[file_path.name] = f"Unsupported file type: {file_extension}"

        filename_end = get_file_path_end_with_ext(file_path.name)
        file_names.append(filename_end)

    return parsed_contents, file_names
def text_regex_clean(text):
    # Merge hyphenated words
    text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
    # If a double newline ends in a letter, add a full stop.
    text = re.sub(r'(?<=[a-zA-Z])\n\n', '.\n\n', text)
    # Fix newlines in the middle of sentences
    text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
    # Remove multiple newlines
    text = re.sub(r"\n\s*\n", "\n\n", text)
    # Collapse repeated spaces into one
    text = re.sub(r" {2,}", " ", text)
    # Add full stops and new lines between words with no space between where the second one has a capital letter
    text = re.sub(r'(?<=[a-z])(?=[A-Z])', '. \n\n', text)

    return text
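# A commented-out sketch of text_regex_clean on a typical extraction artefact; the input
# string is illustrative only.
# sample = "A hyphen-\nated word and a line\nbreak in mid sentence"
# print(text_regex_clean(sample))
# # -> "A hyphenated word and a line break in mid sentence"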
def parse_csv_or_excel(file_path, data_state, text_column="text"):
    """
    Read in a CSV, Excel, or Parquet file.

    Parameters:
        file_path: List of uploaded file objects (each with a .name attribute).
        data_state (Pandas DataFrame): Dataframe already loaded from the file.
        text_column (str): Name of the column in the file that contains the text content.

    Returns:
        Pandas DataFrame: Dataframe output from file read, plus the file name and a status message.
    """

    #out_df = pd.DataFrame()

    file_list = [string.name for string in file_path]
    #print(file_list)

    data_file_names = [string.lower() for string in file_list if "tokenised" not in string.lower() and "npz" not in string.lower()]  # and "gz" not in string.lower()]
    data_file_name = data_file_names[0]

    #for file_path in file_paths:
    file_name = get_file_path_end_with_ext(data_file_name)

    #print(file_extension)

    # if file_extension == "csv":
    #     df = pd.read_csv(data_file_names[0], low_memory=False)
    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
    #     df['source'] = file_name
    #     df['page_section'] = ""
    # elif file_extension == "xlsx":
    #     df = pd.read_excel(data_file_names[0], engine='openpyxl')
    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
    #     df['source'] = file_name
    #     df['page_section'] = ""
    # elif file_extension == "parquet":
    #     df = pd.read_parquet(data_file_names[0])
    #     if text_column not in df.columns: return pd.DataFrame(), ['Please choose a valid column name']
    #     df['source'] = file_name
    #     df['page_section'] = ""
    # else:
    #     print(f"Unsupported file type: {file_extension}")
    #     return pd.DataFrame(), ['Please choose a valid file type']

    df = data_state
    #df['source'] = file_name
    #df['page_section'] = ""

    message = "Loaded in file. Now converting to document format."
    print(message)

    return df, file_name, message
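# A commented-out sketch of calling parse_csv_or_excel directly. `uploaded` stands in for the
# list of Gradio file objects (anything with a `.name` attribute) and `data_state` for the
# DataFrame already loaded elsewhere in the app; both names are illustrative only.
# from types import SimpleNamespace
# data_state = pd.DataFrame({"text": ["First passage.", "Second passage."]})
# uploaded = [SimpleNamespace(name="example_data.csv")]
# df_out, file_name, message = parse_csv_or_excel(uploaded, data_state, text_column="text")
# print(file_name, message)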
# +
# Convert parsed text to docs
# -

def write_out_metadata_as_string(metadata_in):
    # If metadata_in is a single dictionary, wrap it in a list
    if isinstance(metadata_in, dict):
        metadata_in = [metadata_in]

    metadata_string = [f"{' '.join(f'{k}: {v}' for k, v in d.items() if k != 'page_section')}" for d in metadata_in]  # ['metadata']
    return metadata_string
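# A commented-out sketch of write_out_metadata_as_string; the metadata dictionary below is
# illustrative only. It returns a list of strings, one per input dictionary, with any
# 'page_section' key dropped.
# print(write_out_metadata_as_string({"row": 1, "source": "example.csv", "page_section": ""}))
# # -> ['row: 1 source: example.csv']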
def combine_metadata_columns(df, cols):

    df['metadata'] = '{'
    df['blank_column'] = ''

    for n, col in enumerate(cols):
        df[col] = df[col].astype(str).str.replace('"', "'").str.replace('\n', ' ').str.replace('\r', ' ').str.replace('\r\n', ' ').str.cat(df['blank_column'].astype(str), sep="")

        df['metadata'] = df['metadata'] + '"' + cols[n] + '": "' + df[col] + '", '

    df['metadata'] = (df['metadata'] + "}").str.replace(', }', '}').str.replace('", }"', '}')

    return df['metadata']
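# A commented-out sketch of combine_metadata_columns on an illustrative DataFrame. The result
# is a string-encoded dictionary per row, later parsed back by parse_metadata.
# meta_df = pd.DataFrame({"text": ["Some text"], "source": ["example.csv"], "row": [1]})
# print(combine_metadata_columns(meta_df, ["source", "row"])[0])
# # -> '{"source": "example.csv", "row": "1"}'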
def split_string_into_chunks(input_string, max_length, split_symbols):
    # Check if input_string or split_symbols are empty
    if not input_string or not split_symbols:
        return [input_string]

    chunks = []
    current_chunk = ""

    for char in input_string:
        current_chunk += char
        if len(current_chunk) >= max_length or char in split_symbols:
            # Add the current chunk to the chunks list
            chunks.append(current_chunk)
            current_chunk = ""

    # Adding any remaining part of the string
    if current_chunk:
        chunks.append(current_chunk)

    return chunks
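# A commented-out sketch of split_string_into_chunks. Note the split check is per-character,
# so multi-character symbols such as ". " in split_strat will not trigger a split; a
# single-character symbol is used here for illustration.
# print(split_string_into_chunks("One. Two. Three.", 50, ["."]))
# # -> ['One.', ' Two.', ' Three.']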
def clean_line_breaks(text):
    # Replace \n and \r\n with a space
    return text.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')
def parse_metadata(row):
    try:
        # Ensure the 'title' field is a string and clean line breaks
        #if 'TITLE' in row:
        #    row['TITLE'] = clean_line_breaks(row['TITLE'])

        # Convert the row to a string if it's not already
        row_str = str(row) if not isinstance(row, str) else row

        row_str = row_str.replace('\n', ' ').replace('\r', ' ').replace('\r\n', ' ')

        # Parse the string into a dictionary
        metadata = ast.literal_eval(row_str)
        # Process metadata
        return metadata
    except (SyntaxError, ValueError) as e:
        print(f"Failed to parse metadata: {row_str}")
        print(f"Error: {e}")
        # Handle the error or log it
        return None  # or some default value
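# A commented-out sketch of parse_metadata on a string-encoded dictionary such as the output
# of combine_metadata_columns; the value below is illustrative only.
# print(parse_metadata('{"source": "example.csv", "row": "1"}'))
# # -> {'source': 'example.csv', 'row': '1'}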
# def csv_excel_text_to_docs_deprecated(df, text_column='text', chunk_size=None) -> List[Document]:
#     """Converts a DataFrame's content to a list of Documents with metadata."""
#
#     print("Converting to documents.")
#
#     doc_sections = []
#     df[text_column] = df[text_column].astype(str)  # Ensure column is a string column
#
#     # For each row in the dataframe
#     for idx, row in df.iterrows():
#         # Extract the text content for the document
#         doc_content = row[text_column]
#
#         # Generate metadata containing other columns' data
#         metadata = {"row": idx + 1}
#         for col, value in row.items():
#             if col != text_column:
#                 metadata[col] = value
#
#         metadata_string = write_out_metadata_as_string(metadata)[0]
#
#         # If chunk_size is provided, split the text into chunks
#         if chunk_size:
#             sections = split_string_into_chunks(doc_content, chunk_size, split_strat)
#
#             # Langchain usage deprecated
#             # text_splitter = RecursiveCharacterTextSplitter(
#             #     chunk_size=chunk_size,
#             #     chunk_overlap=chunk_overlap,
#             #     split_strat=split_strat,
#             #     start_index=start_index
#             # )  # Other arguments as required by the splitter
#             # sections = text_splitter.split_text(doc_content)
#
#             # For each section, create a Document object
#             for i, section in enumerate(sections):
#                 section = '. '.join([metadata_string, section])
#                 doc = Document(page_content=section,
#                                metadata={**metadata, "section": i, "row_section": f"{metadata['row']}-{i}"})
#                 doc_sections.append(doc)
#         else:
#             # If no chunk_size is provided, create a single Document object for the row
#             #doc_content = '. '.join([metadata_string, doc_content])
#             doc = Document(page_content=doc_content, metadata=metadata)
#             doc_sections.append(doc)
#
#     message = "Data converted to document format. Now creating/loading document embeddings."
#     print(message)
#
#     return doc_sections, message
def csv_excel_text_to_docs(df, in_file, text_column='text', clean="No", return_intermediate_files="No", chunk_size=None, progress=gr.Progress()) -> List[Document]:
    """Converts a DataFrame's content to a list of dictionaries in the 'Document' format, containing page_content and associated metadata."""

    file_list = [string.name for string in in_file]

    data_file_names = [string.lower() for string in file_list if "tokenised" not in string and "npz" not in string.lower()]
    data_file_name = data_file_names[0]

    # Check if file is a document format, and explode out as needed
    if "prepared_docs" in data_file_name:
        print("Loading in documents from file.")

        #print(df[0:5])
        #section_series = df.iloc[:,0]
        #section_series = "{" + section_series + "}"

        doc_sections = df
        #print(doc_sections[0])

        # Convert each element in the Series to a Document instance
        #doc_sections = section_series.apply(lambda x: Document(**x))

        return doc_sections, "Finished preparing documents"
        # df = document_to_dataframe(df.iloc[:,0])

    ingest_tic = time.perf_counter()

    doc_sections = []
    df[text_column] = df[text_column].astype(str).str.strip()  # Ensure column is a string column

    if clean == "Yes":
        clean_tic = time.perf_counter()
        print("Starting data clean.")

        #df = df.drop_duplicates(text_column)

        df[text_column] = initial_clean(df[text_column])
        df_list = list(df[text_column])

        # Save to file if you have cleaned the data
        out_file_name, text_column = save_prepared_bm25_data(data_file_name, df_list, df, text_column)

        clean_toc = time.perf_counter()
        clean_time_out = f"Cleaning the text took {clean_toc - clean_tic:0.1f} seconds."
        print(clean_time_out)

    cols = [col for col in df.columns if col != text_column]

    df["metadata"] = combine_metadata_columns(df, cols)

    df = df.rename(columns={text_column: "page_content"})

    #print(df[["page_content", "metadata"]].to_dict(orient='records'))
    #doc_sections = df[["page_content", "metadata"]].to_dict(orient='records')
    #doc_sections = [Document(**row) for row in df[["page_content", "metadata"]].to_dict(orient='records')]

    # Create a list of Document objects
    doc_sections = [Document(page_content=row['page_content'],
                             metadata=parse_metadata(row["metadata"]))
                    for index, row in progress.tqdm(df.iterrows(), desc="Splitting up text", unit="rows")]

    ingest_toc = time.perf_counter()

    ingest_time_out = f"Preparing documents took {ingest_toc - ingest_tic:0.1f} seconds"
    print(ingest_time_out)

    if return_intermediate_files == "Yes":
        data_file_out_name_no_ext = get_file_path_end(data_file_name)
        file_name = data_file_out_name_no_ext
        #print(doc_sections)
        #page_content_series_string = pd.Series(doc_sections).astype(str)
        #page_content_series_string = page_content_series_string.str.replace(" type='Document'", "").str.replace("' metadata=", "', 'metadata':").str.replace("page_content=", "{'page_content':")
        #page_content_series_string = page_content_series_string + "}"
        #print(page_content_series_string[0])
        #metadata_series_string = pd.Series(doc_sections[1]).astype(str)

        import pickle

        if clean == "No":
            #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs.parquet")

            with gzip.open(file_name + "_prepared_docs.pkl.gz", 'wb') as file:
                pickle.dump(doc_sections, file)

            #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs.pkl")

        elif clean == "Yes":
            #pd.DataFrame(data = {"Documents":page_content_series_string}).to_parquet(file_name + "_prepared_docs_clean.parquet")

            with gzip.open(file_name + "_prepared_docs_clean.pkl.gz", 'wb') as file:
                pickle.dump(doc_sections, file)

            #pd.Series(doc_sections).to_pickle(file_name + "_prepared_docs_clean.pkl")

        print("Documents saved to file.")

    return doc_sections, "Finished preparing documents."
def document_to_dataframe(documents):
    '''
    Convert an object in document format to a pandas dataframe
    '''
    rows = []

    for doc in documents:
        # Convert Document to dictionary and extract metadata
        doc_dict = doc.dict()
        metadata = doc_dict.pop('metadata')

        # Add the page_content and type to the metadata
        metadata['page_content'] = doc_dict['page_content']
        metadata['type'] = doc_dict['type']

        # Add to the list of rows
        rows.append(metadata)

    # Create a DataFrame from the list of rows
    df = pd.DataFrame(rows)

    return df
# Example usage
#documents = [
#    Document(page_content="Example content 1", metadata={"author": "Author 1", "year": 2021}),
#    Document(page_content="Example content 2", metadata={"author": "Author 2", "year": 2022})
#]

#df = document_to_dataframe(documents)
#df